-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathaeg.html
More file actions
1552 lines (1121 loc) · 146 KB
/
aeg.html
File metadata and controls
1552 lines (1121 loc) · 146 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="zh-CN">
<head hexo-theme='https://github.com/volantis-x/hexo-theme-volantis/tree/4.3.1'>
<meta charset="utf-8">
<!-- SEO相关 -->
<!-- 渲染优化 -->
<meta http-equiv='x-dns-prefetch-control' content='on' />
<link rel='dns-prefetch' href='https://cdn.jsdelivr.net'>
<link rel="preconnect" href="https://cdn.jsdelivr.net" crossorigin>
<meta name="renderer" content="webkit">
<meta name="force-rendering" content="webkit">
<meta http-equiv="X-UA-Compatible" content="IE=Edge,chrome=1">
<meta name="HandheldFriendly" content="True" >
<meta name="apple-mobile-web-app-capable" content="yes">
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">
<link rel="preload" href="/css/first.css" as="style">
<!-- 页面元数据 -->
<title>[论文解读]Adversarial Environment Generation for Learning to Navigate the Web - lornd's blog</title>
<!-- feed -->
<!-- import meta -->
<!-- link -->
<!-- import link -->
<link rel="stylesheet" href="/css/first.css">
<link rel="stylesheet" href="/css/style.css" media="print" onload="this.media='all';this.onload=null">
<noscript><link rel="stylesheet" href="/css/style.css"></noscript>
<script id="loadcss"></script>
<script>
if (/*@cc_on!@*/false || (!!window.MSInputMethodContext && !!document.documentMode))
document.write(
'<style>'+
'html{'+
'overflow-x: hidden !important;'+
'overflow-y: hidden !important;'+
'}'+
'.kill-ie{'+
'text-align:center;'+
'height: 100%;'+
'margin-top: 15%;'+
'margin-bottom: 5500%;'+
'}'+
'</style>'+
'<div class="kill-ie">'+
'<h1><b>抱歉,您的浏览器无法访问本站</b></h1>'+
'<h3>微软已经于2016年终止了对 Internet Explorer (IE) 10 及更早版本的支持,<br/>'+
'继续使用存在极大的安全隐患,请使用当代主流的浏览器进行访问。</h3><br/>'+
'<a target="_blank" rel="noopener" href="https://www.microsoft.com/zh-cn/WindowsForBusiness/End-of-IE-support"><strong>了解详情 ></strong></a>'+
'</div>');
</script>
<noscript>
<style>
html{
overflow-x: hidden !important;
overflow-y: hidden !important;
}
.kill-noscript{
text-align:center;
height: 100%;
margin-top: 15%;
margin-bottom: 5500%;
}
</style>
<div class="kill-noscript">
<h1><b>抱歉,您的浏览器无法访问本站</b></h1>
<h3>本页面需要浏览器支持(启用)JavaScript</h3><br/>
<a target="_blank" rel="noopener" href="https://www.baidu.com/s?wd=启用JavaScript"><strong>了解详情 ></strong></a>
</div>
</noscript>
<!-- hexo injector head_end start -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.12.0/dist/katex.min.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/hexo-math@4.0.0/dist/style.css">
<!-- hexo injector head_end end --><link href="https://cdn.bootcss.com/KaTeX/0.11.1/katex.min.css" rel="stylesheet" /></head>
<body>
<header id="l_header" class="l_header auto shadow blur show" style='opacity: 0' >
<div class='container'>
<div id='wrapper'>
<div class='nav-sub'>
<p class="title"></p>
<ul class='switcher nav-list-h m-phone' id="pjax-header-nav-list">
<li><a id="s-comment" class="fas fa-comments fa-fw" target="_self" href='javascript:void(0)'></a></li>
<li><a id="s-toc" class="s-toc fas fa-list fa-fw" target="_self" href='javascript:void(0)'></a></li>
</ul>
</div>
<div class="nav-main">
<a class="title flat-box" target="_self" href='/'>
<img no-lazy class='logo' src='https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/blog/Logo-NavBar@3x.png'/>
</a>
<div class='menu navigation'>
<ul class='nav-list-h m-pc'>
<li>
<a class="menuitem flat-box faa-parent animated-hover" href=/
id="home"
>
<i class='fas fa-rss fa-fw'></i>博客
</a>
</li>
<li>
<a class="menuitem flat-box faa-parent animated-hover" href=/friends/
id="friends"
>
<i class='fas fa-link fa-fw'></i>友情链接
</a>
</li>
</ul>
</div>
<div class="m_search">
<form name="searchform" class="form u-search-form">
<i class="icon fas fa-search fa-fw"></i>
<input type="text" class="input u-search-input" placeholder="Search..." />
</form>
</div>
<ul class='switcher nav-list-h m-phone'>
<li><a class="s-search fas fa-search fa-fw" target="_self" href='javascript:void(0)'></a></li>
<li>
<a class="s-menu fas fa-bars fa-fw" target="_self" href='javascript:void(0)'></a>
<ul class="menu-phone list-v navigation white-box">
<li>
<a class="menuitem flat-box faa-parent animated-hover" href=/
id="home"
>
<i class='fas fa-rss fa-fw'></i>博客
</a>
</li>
<li>
<a class="menuitem flat-box faa-parent animated-hover" href=/friends/
id="friends"
>
<i class='fas fa-link fa-fw'></i>友情链接
</a>
</li>
</ul>
</li>
</ul>
</div>
</div>
</div>
</header>
<div id="l_body">
<div id="l_cover">
<div id="full" class='cover-wrapper post dock' style="display: none;">
<div class='cover-bg lazyload placeholder' data-bg="https://cdn.jsdelivr.net/gh/volantis-x/cdn-wallpaper-minimalist/2020/042.jpg"></div>
<div class='cover-body'>
<div class='top'>
<p class="title">lornd's Blog</p>
<p class="subtitle">学算法时一定要想怎么去改进!</p>
</div>
<div class='bottom'>
<div class='menu navigation'>
<div class='list-h'>
</div>
</div>
</div>
</div>
<div id="scroll-down" style="display: none;"><i class="fa fa-chevron-down scroll-down-effects"></i></div>
</div>
</div>
<div id="safearea">
<div class="body-wrapper" id="pjax-container">
<div class='l_main'>
<article class="article post white-box reveal md shadow article-type-post" id="post" itemscope itemprop="blogPost">
<div class="article-meta" id="top">
<h1 class="title">
[论文解读]Adversarial Environment Generation for Learning to Navigate the Web
</h1>
<div class='new-meta-box'>
<div class='new-meta-item author'>
<a class='author' href="https://lornd.top" rel="nofollow">
<img no-lazy src="\images\avatar.jpg">
<p>lornd</p>
</a>
</div>
<div class="new-meta-item date">
<a class='notlink'>
<i class="fas fa-calendar-alt fa-fw" aria-hidden="true"></i>
<p>发布于:2023年8月20日</p>
</a>
</div>
<div class="new-meta-item browse leancloud">
<a class='notlink'>
<div id="lc-pv" data-title="[论文解读]Adversarial Environment Generation for Learning to Navigate the Web" data-path="/aeg.html">
<i class="fas fa-eye fa-fw" aria-hidden="true"></i>
<span id='number'><i class="fas fa-circle-notch fa-spin fa-fw" aria-hidden="true"></i></span>
次浏览
</div>
</a>
</div>
</div>
</div>
<p>论文地址:<a target="_blank" rel="noopener" href="https://arxiv.org/abs/2103.01991">Adversarial Environment Generation for Learning to Navigate the Web</a> 。</p>
<h2 id="摘要">摘要</h2>
<p>学习如何自动在网页中进行导航是一个困难的序列决策工作。状态空间和动作空间巨大,且具有组合特性,同时网站也是包含多个页面的动态环境。训练网页导航智能体的一个瓶颈是为训练环境提供一个可学习的课程,并且要求能够覆盖真实世界网站的多样性。因此,我们提出用对抗环境生成(Adversarial Environment Generation,AEG)方法生成具有挑战性的网页环境,以训练强化学习智能体。</p>
<ul>
<li>我们提供了一个新的基准环境 gMiniWob ,使强化学习的对手能够使用组合原语学习生成任意复杂的网站。</li>
<li>为了训练这个对手,我们提出了一种新的技术,利用导航智能体获得的分数差来最大化遗憾。我们的实验结果表明:我们的方法在最小最大化遗憾 AEG 方面明显优于之前的方法。</li>
<li>遗憾目标训练对手设计一套对于导航智能体来说是“适中挑战”的课程。我们的实验结果表明:随着时间的推移,对手学会了生成逐渐变难的网页导航任务。</li>
<li>使用我们的技术训练的导航智能体学会了完成具有挑战性的,高维的网页导航任务,如表单填写,航班预定等。我们的实验结果表明:在一组具有挑战性的不可见测试环境中,通过我们提出的灵活的 b-PAIRED 技术训练的导航智能体的性能显著优于具有竞争力的自动化课程生成基线,包括了一个最先进的强化学习网页导航方法,并且在一些任务上达到了超过 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>80</mn><mi mathvariant="normal">%</mi></mrow><annotation encoding="application/x-tex">80\%</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.80556em;vertical-align:-0.05556em;"></span><span class="mord">8</span><span class="mord">0</span><span class="mord">%</span></span></span></span> 的准确率。</li>
</ul>
<h2 id="研究背景">研究背景</h2>
<p>本文的主要目的是训练一个强化学习智能体自动在网页中进行导航,将相关信息正确输入到未知的现实网站中。这种能力可以让用户发出如预定机票和发朋友圈等请求,并让强化学习智能体自动处理完成任务的细节。然而,现实网站的复杂性和多样性让这个任务非常困难。</p>
<p>为了让我们的智能体能够泛化到新的网站,我们让其直接在 DOM 树上进行操作,智能体必须正确选择并将信息填入到合适的 DOM 元素中。这让问题的状态-动作空间非常大。即使智能体能够在网页中导航到正确的表单,甚至选择了正确的元素,仍然会有很多的可以输入的值。为了解决这个问题,前人的研究利用了从专家演示进行动作克隆的方法。但是,这个方法十分脆弱,并且不能有效地泛化。为每个网站收集导航演示是不可能的,尤其在网页经常变化和更新的情况下。如果没有可用的演示数据,基于模仿学习的模型就不能够泛化到新的网站。</p>
<p>成功在大范围的现实网站中进行导航需要在大量可能的任务和环境中对智能体进行训练。问题在于如何创建这样一个任务分布,其不仅能够涵盖大多数现实任务,而且能够以可以让智能体进行学习课程的形式呈现。一种可能的做法是为网站人为设计一个提前定义好的课程。但是这种做法十分枯燥,耗时,易错,而且脆弱,设计者很容易忽略一些现实边缘情况。另外一种做法是使用域随机算法(Domain Randomization, DR)对网站的参数进行随机化,或者随着时间的推移自动增加一些参数以控制难度。然而,这些方法可能无法覆盖重要的测试案例,并且不能根据智能体当前的能力来调整参数配置的难度。</p>
<h2 id="文章贡献">文章贡献</h2>
<p>在本文中,我们利用最新的对抗环境生成(AEG)技术为富有挑战性的网页导航任务建立课程。具体地,我们训练一个对抗强化学习智能体学习在网站中创建新的页面,以利用当前正在学习网页导航的智能体的弱点。为了实验这个 AEG 网页设计技术,我们建立了一个新的框架 gMiniWoB ,以允许对抗智能体利用常见的设计基元构建网站,如导航栏、产品转盘、项目卡片、网络表单和购物车等。这个环境是开源的以促进后续的研究。</p>
<p>AEG 的目标是自动生成一个训练环境的课程,涵盖可能的网站空间,并由此实现到现实网站导航任务的泛化。然而,如果我们只是简单地应用一个最小最大对抗智能体,即对抗智能体试图让学习智能体的表现最糟糕,那么这种课程不太可能出现。这是因为对抗智能体的动机是创建一个尽可能难的网站,而不是根据学习智能体当前的技术水平调整网站难度。最近提出的一种 AEG 技术 PAIRED(Protagonist Antagonist Induced Regret Environment Design)训练对抗智能体去最大化遗憾值。我们通过两种新的算法改进了原始的 PAIRED 算法:</p>
<ol>
<li>我们提出了一种更加灵活的计算遗憾的方法,使得我们的算法梗不容易陷入局部最小值。</li>
<li>我们引入了一个明确的预算机制,使得对抗智能体在学习智能体不能完成任务时,因为使得环境更复杂而受到惩罚;否则的话对抗智能体会因为使得环境变复杂而受到奖励。</li>
</ol>
<p>本文总体有如下贡献:</p>
<ul>
<li>一个新的基准环境 gMiniWoB ,通过允许利用组合设计基元来构建网站,增强 AEG 在网页导航中的应用。</li>
<li>Flexible b-PAIRED 算法,可计算出一个更加稳定的对遗憾的估计,并直接鼓励对抗智能体根据学习智能体的表现调整环境的复杂程度。</li>
<li>实验结果表明:Flexible b-PAIRED 申城了一套包含越来越具有挑战性的网站的课程,并且训练出了能够在测试时成功泛化到复杂、未知网站的智能体。我们的方法明显超过了在最小最大遗憾 AEG 上的之前的工作,也包括了使用强化学习训练网页导航智能体的最先进的方法。</li>
</ul>
<h2 id="问题描述">问题描述</h2>
<h3 id="网页导航问题">网页导航问题</h3>
<p>我们将网页导航问题定义为一个序列决策问题。我们训练一个智能体,其参数为一个网络 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>π</mi><mo stretchy="false">(</mo><msub><mi>a</mi><mi>t</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>s</mi><mi>t</mi></msub><mo separator="true">;</mo><msub><mi mathvariant="normal">Θ</mi><mi>i</mi></msub><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\pi(a_t|s_t;\Theta_i)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">s</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mpunct">;</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord">Θ</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span> ,将输入的状态 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>s</mi><mi>t</mi></msub></mrow><annotation encoding="application/x-tex">s_t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">s</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 转换为输出的动作 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>a</mi><mi>t</mi></msub></mrow><annotation encoding="application/x-tex">a_t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 以最大化累计折损增益 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>O</mi><mo>=</mo><msubsup><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>0</mn></mrow><mi>T</mi></msubsup><msup><mi>γ</mi><mi>t</mi></msup><msub><mi>r</mi><mi>t</mi></msub></mrow><annotation encoding="application/x-tex">O = \sum\limits_{t=0}^T \gamma^tr_t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.02778em;">O</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:2.4954490000000003em;vertical-align:-0.9671129999999999em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.5283360000000004em;"><span style="top:-2.132887em;margin-left:0em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">0</span></span></span></span><span style="top:-3.0000050000000003em;"><span class="pstrut" style="height:3em;"></span><span><span class="mop op-symbol small-op">∑</span></span></span><span style="top:-3.950005em;margin-left:0em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">T</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.9671129999999999em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.05556em;">γ</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7935559999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span></span></span></span></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.02778em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> ,其中 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>r</mi><mi>t</mi></msub></mrow><annotation encoding="application/x-tex">r_t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.2805559999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.02778em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 是在 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>t</mi></mrow><annotation encoding="application/x-tex">t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.61508em;vertical-align:0em;"></span><span class="mord mathdefault">t</span></span></span></span> 时刻所获得的奖励,<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>γ</mi></mrow><annotation encoding="application/x-tex">\gamma</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.19444em;"></span><span class="mord mathdefault" style="margin-right:0.05556em;">γ</span></span></span></span> 是折损因子,<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>T</mi></mrow><annotation encoding="application/x-tex">T</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.13889em;">T</span></span></span></span> 是一个 episode 的长度。 <strong>我们将网页和用户指令作为输入状态</strong>。其中网页在每个时刻都会被动态更新,而用户指令在最开始就确定不变。我们使用 DOM 树表示网页,其中每个元素都用一个(属性,值)二元组的集合和一个特征数组进行表示。指令是字段的集合,其中字段都表示为(键,值)二元组。对于每个任务,键是固定的,但是值会基于用户的输入而动态变化。</p>
<p><strong>每个动作都被表示为(元素,字段)二元组,表示在元素上进行动作,并且将字段的值作为输入</strong>,即将字段的值输入元素。智能体在每个 episode 的最后会收到一个<strong>任务成功奖励</strong> (<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>1.0</mn></mrow><annotation encoding="application/x-tex">1.0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord">1</span><span class="mord">.</span><span class="mord">0</span></span></span></span> 或 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo>−</mo><mn>1.0</mn></mrow><annotation encoding="application/x-tex">-1.0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.72777em;vertical-align:-0.08333em;"></span><span class="mord">−</span><span class="mord">1</span><span class="mord">.</span><span class="mord">0</span></span></span></span>);每当页面中的元素被更新,智能体会收到一个<strong>基于潜力的奖励</strong>;为了鼓励智能体进行高效地导航,每个时刻智能体都会收到一个<strong>小惩罚</strong>。</p>
<h3 id="paired-算法">PAIRED 算法</h3>
<p>AEG 训练一个对抗策略 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>E</mi></msub></mrow><annotation encoding="application/x-tex">\pi_E</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.03588em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.05764em;">E</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 以设计环境使得学习智能体策略 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>P</mi></msub></mrow><annotation encoding="application/x-tex">\pi_P</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.03588em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 的表现尽可能坏。假设 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msubsup><mi>R</mi><mi>i</mi><mi>P</mi></msubsup><mo>=</mo><msubsup><mo>∑</mo><mrow><mi>t</mi><mo>=</mo><mn>1</mn></mrow><mi>T</mi></msubsup><msup><mi>γ</mi><mi>t</mi></msup><msubsup><mi>r</mi><mi>t</mi><mi>P</mi></msubsup></mrow><annotation encoding="application/x-tex">R^P_i = \sum\limits_{t=1}^T \gamma^tr^P_t</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0999949999999998em;vertical-align:-0.258664em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.441336em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.258664em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:2.4954490000000003em;vertical-align:-0.9671129999999999em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.5283360000000004em;"><span style="top:-2.132887em;margin-left:0em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">t</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.0000050000000003em;"><span class="pstrut" style="height:3em;"></span><span><span class="mop op-symbol small-op">∑</span></span></span><span style="top:-3.950005em;margin-left:0em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">T</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.9671129999999999em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.05556em;">γ</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.7935559999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span></span></span></span></span></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.02778em;">r</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.4530000000000003em;margin-left:-0.02778em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">t</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.247em;"><span></span></span></span></span></span></span></span></span></span> 表示学习策略在轨迹 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>i</mi></mrow><annotation encoding="application/x-tex">i</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.65952em;vertical-align:0em;"></span><span class="mord mathdefault">i</span></span></span></span> 上所获得的总奖励值。在最小最大 AEG 中,对抗策略的目标很简单:<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo>−</mo><msup><mi>R</mi><mi>P</mi></msup></mrow><annotation encoding="application/x-tex">-R^P</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.924661em;vertical-align:-0.08333em;"></span><span class="mord">−</span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span></span></span></span></span></span></span></span> 。因此,极小极大对抗策略有动机创造过于困难或不可能的环境,这可能使得学习策略无法学习。然而,PAIRED 算法训练对抗策略最大化学习策略的遗憾值,定义为学习策略所获得回报与最有策略所获得回报的插值 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>R</mi><mo>∗</mo></msup><mo>−</mo><msup><mi>R</mi><mi>P</mi></msup></mrow><annotation encoding="application/x-tex">R^*-R^P</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.772026em;vertical-align:-0.08333em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.688696em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mbin mtight">∗</span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:0.8413309999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span></span></span></span></span></span></span></span> 。当奖励函数包含对更高效地完成任务的激励时,对于那些在最优策略下,可以在几步之内完成,但是当前策略无法完成的简单任务,遗憾值会很高。因此,最大化遗憾的对抗策略会不断提出更加容易的任务,直到学习策略开始解决这些任务。这也让遗憾值成为 AEG 的一个理想目标。</p>
<p>为了估计遗憾值,PAIRED 算法引入了第三个智能体——反派智能体(策略为 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>A</mi></msub></mrow><annotation encoding="application/x-tex">\pi_A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.03588em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span>),它限制对抗智能体只能生成反派智能体能够完成的可行环境。当对抗智能体生成了一个环境 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>E</mi></mrow><annotation encoding="application/x-tex">E</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.05764em;">E</span></span></span></span> ,学习智能体和反派智能体都会在环境 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>E</mi></mrow><annotation encoding="application/x-tex">E</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.05764em;">E</span></span></span></span> 中收集 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>M</mi></mrow><annotation encoding="application/x-tex">M</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">M</span></span></span></span> 条轨迹,其回报分别为 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msubsup><mi>R</mi><mn>1</mn><mi>P</mi></msubsup><mo separator="true">,</mo><mo>⋯</mo><mtext> </mtext><mo separator="true">,</mo><msubsup><mi>R</mi><mi>M</mi><mi>P</mi></msubsup><mo separator="true">,</mo><msubsup><mi>R</mi><mn>1</mn><mi>A</mi></msubsup><mo separator="true">,</mo><mo>⋯</mo><mtext> </mtext><mo separator="true">,</mo><msubsup><mi>R</mi><mi>M</mi><mi>A</mi></msubsup></mrow><annotation encoding="application/x-tex">R^P_1,\cdots,R^P_M,R^A_1,\cdots,R^A_M</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.1166619999999998em;vertical-align:-0.275331em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.4518920000000004em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.24810799999999997em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="minner">⋯</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.424669em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.10903em;">M</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.275331em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.4518920000000004em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">1</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.24810799999999997em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="minner">⋯</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-2.424669em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.10903em;">M</span></span></span><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.275331em;"><span></span></span></span></span></span></span></span></span></span>,最后的遗憾值计算如下:</p>
<p class="katex-block"><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mrow><mi mathvariant="normal">R</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">g</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">t</mi></mrow><mo>=</mo><munder><mo><mi>max</mi><mo></mo></mo><mi>i</mi></munder><msubsup><mi>R</mi><mi>i</mi><mi>A</mi></msubsup><mo>−</mo><mfrac><mn>1</mn><mi>M</mi></mfrac><munderover><mo>∑</mo><mrow><mi>m</mi><mo>=</mo><mn>1</mn></mrow><mi>M</mi></munderover><msubsup><mi>R</mi><mi>m</mi><mi>P</mi></msubsup></mrow><annotation encoding="application/x-tex">{\rm Regret} = \max\limits_{i} R^A_i - \dfrac{1}{M}\sum\limits_{m=1}^MR^P_m
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8777699999999999em;vertical-align:-0.19444em;"></span><span class="mord"><span class="mord"><span class="mord mathrm">R</span><span class="mord mathrm">e</span><span class="mord mathrm" style="margin-right:0.01389em;">g</span><span class="mord mathrm">r</span><span class="mord mathrm">e</span><span class="mord mathrm">t</span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.618995em;vertical-align:-0.7276640000000001em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.43055999999999983em;"><span style="top:-2.372336em;margin-left:0em;"><span class="pstrut" style="height:3em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span style="top:-3em;"><span class="pstrut" style="height:3em;"></span><span><span class="mop">max</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.7276640000000001em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-2.4530000000000003em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.247em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:3.0954490000000003em;vertical-align:-1.267113em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.32144em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10903em;">M</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">1</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283360000000002em;"><span style="top:-1.882887em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">m</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.050005em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3000050000000005em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.10903em;">M</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.267113em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-2.4530000000000003em;margin-left:-0.00773em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">m</span></span></span><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.247em;"><span></span></span></span></span></span></span></span></span></span></span></p>
<p>在这种情况下,如果对抗策略和反派策略协调一致并与学习策略达成纳什均衡,那么学习智能体将学会最小化遗憾。然而,在实践中,基于梯度的多智能体强化学习并没有收敛性保证,它们具有高度非稳态性,并且经常不会收敛。如果反派策略和对抗策略不能协调一致,那么 PAIRED 算法会通过优化反派策略以最小化遗憾值。在这种情况下,上式中的目标只会让学习策略学得像反派策略一样好。如果反派策略无法提升,或者达到一个局部最优值,那么对抗策略也就无法继续训练学习策略了。因此本文提出了一个改进的目标以解决这个问题。</p>
<h2 id="解决方法">解决方法</h2>
<p>我们从一个空的网站开始,逐渐添加新的页面和页面之间的链接。由于我们通过 DOM 树来表示页面,我们重点关注 DOM 树的创建,并且假设页面之间的链接是根据绑定到某些元素的事件隐式定义的。</p>
<p>设计 DOM 树的最通用的方法是以自底向上的方法组合一组任意元素,但是这会生成大量的语义不一致的错误网站。如图 3 的第二个页面所示,文本框的顶部有一个标签“First Name”。现在,如果我们将这个标签插入到第一个页面中“Username”文本框的上面,那么这个网站的格式就会变得不正确,因为无法确定文本框是指“Username”还是“First Name”。</p>
<div class="gallery ">
<p><img src="/images/aeg/fig3.png" class="lazyload" data-srcset="/images/aeg/fig3.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="网页生成示例"></p>
</div>
<p>因此,我们将网页设计定义为一系列原子 DOM 子树的组合,这些子树足够通用以构建复杂的网站,但是可以在树形结构中安全地组合。收i西安,我们创建了一组未指定的 DOM 树模板,将其中一些元素和属性替换成了变量。通过在模板中为属性赋值,就能够生成一个完全指定的原子 DOM 树,并且可以和其他原子元素组合创建一个新的网页。原子的组合顺序也决定了网页将会如何渲染。</p>
<div class="gallery ">
<p><img src="/images/aeg/fig2.png" class="lazyload" data-srcset="/images/aeg/fig2.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="DOM 树模板示例"></p>
</div>
<p>图 2 是未指定 DOM 树模板及其在不同变量赋值情况下实例化的一个例子。在图 2b 中,我们创建了一个输入模板,包括一个变量标签和一个文本框,它们有一个共同的父节点;在图 2a 中,我们选择标签元素,并为它的文本属性赋值;在图 2c 中,我们为文本框的内部文本赋值,而忽略了标签元素。</p>
<h3 id="网页设计组件">网页设计组件</h3>
<p>我们引入了一个名为 gMiniWoB 的新框架,用于自动生成网站,其实现了来自 11 个不同的未指定 DOM 模板中的 40 个不同的设计组件。这些组件在网页中被广泛使用,包括导航栏、轮播图、项目卡片、网页表单、购物车、下拉菜单等。每个组件都有至少一个可交互的元素,当智能体与其进行交互时,会改变 DOM 结构。每个组件根据其在奖励计算中不同的用处分成两类:(1)<strong>主动组件(已使用)</strong>;(2)<strong>被动组件(未使用)</strong>。40 个不同的组件中,26 个组件时主动组件,其他的是被动组件。当一个新的主动组件被添加到网页中时,指令会自动扩展以适应对应的字段。这使得主动组件相较于被动组件(大多数情况下被用作噪声)学起来更加复杂。然而,真实网站中往往包括很多分散注意力的元素(被动组件),因此智能体能够学会忽略它们非常重要。</p>
<h3 id="对抗智能体架构">对抗智能体架构</h3>
<p>为了解决组合环境生成问题,我们提出了一个自回归的对抗策略,其目标时将一系列组件放到一系列位置上。我们使用策略 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>E</mi></msub><mo stretchy="false">(</mo><msup><mi>a</mi><mi>A</mi></msup><mi mathvariant="normal">∣</mi><msup><mi>o</mi><mi>A</mi></msup><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\pi_E(a^A|o^A)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.0913309999999998em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.03588em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.05764em;">E</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span> 对对抗智能体进行参数化:</p>
<p class="katex-block"><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>π</mi><mi>E</mi></msub><mo stretchy="false">(</mo><msup><mi>a</mi><mi>A</mi></msup><mi mathvariant="normal">∣</mi><msup><mi>o</mi><mi>A</mi></msup><mo stretchy="false">)</mo><mo>=</mo><mi>π</mi><mo stretchy="false">(</mo><mi>k</mi><mi mathvariant="normal">∣</mi><mi>K</mi><mo stretchy="false">)</mo><munderover><mo>∏</mo><mrow><mi>i</mi><mo>=</mo><mn>0</mn></mrow><mi>N</mi></munderover><mi>π</mi><mo stretchy="false">(</mo><msub><mi>a</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>a</mi><mrow><mn>0</mn><mo>⋯</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo separator="true">,</mo><msub><mi>b</mi><mrow><mn>0</mn><mo>⋯</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo separator="true">,</mo><mi>k</mi><mo stretchy="false">)</mo><mi>π</mi><mo stretchy="false">(</mo><msub><mi>b</mi><mi>i</mi></msub><mi mathvariant="normal">∣</mi><msub><mi>a</mi><mrow><mn>0</mn><mo>⋯</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo separator="true">,</mo><msub><mi>b</mi><mrow><mn>0</mn><mo>⋯</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo separator="true">,</mo><mi>k</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\pi_E(a^A|o^A) = \pi(k|K)\prod\limits_{i=0}^N\pi(a_i|a_{0\cdots i-1},b_{0\cdots i-1},k)\pi(b_i|a_{0\cdots i-1},b_{0\cdots i-1},k)
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1.1413309999999999em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.03588em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.05764em;">E</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:3.106005em;vertical-align:-1.277669em;"></span><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span><span class="mord">∣</span><span class="mord mathdefault" style="margin-right:0.07153em;">K</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283360000000002em;"><span style="top:-1.872331em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">0</span></span></span></span><span style="top:-3.050005em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∏</span></span></span><span style="top:-4.3000050000000005em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.10903em;">N</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.277669em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="minner mtight">⋯</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.208331em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="minner mtight">⋯</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.208331em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span><span class="mclose">)</span><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="minner mtight">⋯</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.208331em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="minner mtight">⋯</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.208331em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span><span class="mclose">)</span></span></span></span></span></p>
<p>其中 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>N</mi></mrow><annotation encoding="application/x-tex">N</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">N</span></span></span></span> 是输出数量的上限;<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>K</mi></mrow><annotation encoding="application/x-tex">K</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.07153em;">K</span></span></span></span> 是位置数量的上限;<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>a</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex">a_i</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 是设计组件;<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>b</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex">b_i</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 是位置序号;<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>o</mi><mi>A</mi></msup></mrow><annotation encoding="application/x-tex">o^A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8413309999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span></span></span></span> 是初始状态。对抗策略首先从参数化的多项式分布 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>C</mi><mi>a</mi><mi>t</mi><mo stretchy="false">(</mo><mn>0</mn><mo separator="true">,</mo><mi>K</mi><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">Cat(0, K)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathdefault" style="margin-right:0.07153em;">C</span><span class="mord mathdefault">a</span><span class="mord mathdefault">t</span><span class="mopen">(</span><span class="mord">0</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.07153em;">K</span><span class="mclose">)</span></span></span></span> 中采样位置的数量 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>k</mi></mrow><annotation encoding="application/x-tex">k</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span></span></span></span> 。在给定 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>o</mi><mi>A</mi></msup></mrow><annotation encoding="application/x-tex">o^A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8413309999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span></span></span></span> 的条件下,它执行一个自回归的模型以生成一组基元及其对应的位置 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mo stretchy="false">[</mo><mn>0</mn><mo separator="true">,</mo><mo>⋯</mo><mtext> </mtext><mo separator="true">,</mo><mi>k</mi><mo stretchy="false">]</mo></mrow><annotation encoding="application/x-tex">[0,\cdots, k]</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mopen">[</span><span class="mord">0</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="minner">⋯</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span><span class="mclose">]</span></span></span></span> 。</p>
<p>类似于对抗生成网络(GAN),我们从标准正太分布中采样 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>o</mi><mi>A</mi></msup></mrow><annotation encoding="application/x-tex">o^A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8413309999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span></span></span></span> 以让对抗策略多样化其设计分布。观测状态首先通过前馈网络编码为 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>h</mi><mn>0</mn></msub><mo>=</mo><msub><mi>f</mi><mn>0</mn></msub><mo stretchy="false">(</mo><msup><mi>o</mi><mi>A</mi></msup><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">h_0=f_0(o^A)</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">h</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.0913309999999998em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">o</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span> ,再通过另外一个网络 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>f</mi><mi>K</mi></msub></mrow><annotation encoding="application/x-tex">f_K</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8888799999999999em;vertical-align:-0.19444em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.07153em;">K</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 输出关于空白页数量的分布。相同的隐藏状态 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>h</mi><mn>0</mn></msub></mrow><annotation encoding="application/x-tex">h_0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">h</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.30110799999999993em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight">0</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 被输入到一个 LSTM 网络中,作为其初始输入向量。LSTM 网络的输出被两个独立的网络 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>f</mi><mi>P</mi></msub></mrow><annotation encoding="application/x-tex">f_P</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8888799999999999em;vertical-align:-0.19444em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 和 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>f</mi><mi>L</mi></msub></mrow><annotation encoding="application/x-tex">f_L</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8888799999999999em;vertical-align:-0.19444em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">L</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 分别用于学习(1)设计组件的分布和(2)位置的分布。我们从这些分布中采样一个组件和一个位置,将它们通过另外一个网络 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>f</mi><mi>I</mi></msub></mrow><annotation encoding="application/x-tex">f_I</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8888799999999999em;vertical-align:-0.19444em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.10764em;">f</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.32833099999999993em;"><span style="top:-2.5500000000000003em;margin-left:-0.10764em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.07847em;">I</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 编码成隐藏向量,作为 LSTM 下一个时刻的输入。<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>N</mi></mrow><annotation encoding="application/x-tex">N</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">N</span></span></span></span> 步之后,采样得到的设计动作就会发送给渲染模块以生成环境。</p>
<p>对于网页导航问题,<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>K</mi></mrow><annotation encoding="application/x-tex">K</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.07153em;">K</span></span></span></span> 表示网站中页面的数量,位置 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>b</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex">b_i</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.84444em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 表示页面,而组件 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>a</mi><mi>i</mi></msub></mrow><annotation encoding="application/x-tex">a_i</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.58056em;vertical-align:-0.15em;"></span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span></span></span></span> 表示 DOM 树组件。图 3 展示了对抗策略如何生成网页。我们还使用了一个特殊的<strong>跳过操作</strong>来增强组件设计操作,当渲染器执行该动作时,不进行任何操作。这允许对抗策略控制添加的组件的数量。</p>
<div class="gallery ">
<p><img src="/images/aeg/fig3.png" class="lazyload" data-srcset="/images/aeg/fig3.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="网页生成示例"></p>
</div>
<h3 id="灵活-paired-算法">灵活 PAIRED 算法</h3>
<p>我们使用灵活的对手选择来提升前文公式所示的遗憾目标。首先初始化两个智能体 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>A</mi></mrow><annotation encoding="application/x-tex">A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault">A</span></span></span></span> 和 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>P</mi></mrow><annotation encoding="application/x-tex">P</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.13889em;">P</span></span></span></span> 。在每次迭代的过程中,对抗智能体设计一个新的网站,每个智能体都通过在网页中进行导航收集到奖励 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>R</mi></mrow><annotation encoding="application/x-tex">R</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.00773em;">R</span></span></span></span> ,最后的遗憾值为:</p>
<p class="katex-block"><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mrow><mi mathvariant="normal">R</mi><mi mathvariant="normal">E</mi><mi mathvariant="normal">G</mi><mi mathvariant="normal">R</mi><mi mathvariant="normal">E</mi><mi mathvariant="normal">T</mi></mrow><mo>=</mo><mi>max</mi><mo></mo><mrow><msup><mi>R</mi><mi>A</mi></msup><mo separator="true">,</mo><msup><mi>R</mi><mi>P</mi></msup></mrow><mo>−</mo><mn>0.5</mn><mo>×</mo><mo stretchy="false">(</mo><msup><mi>R</mi><mi>A</mi></msup><mo>+</mo><msup><mi>R</mi><mi>P</mi></msup><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">{\rm REGRET} = \max{R^A,R^P} - 0.5 \times (R^A+R^P)
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord"><span class="mord"><span class="mord mathrm">R</span><span class="mord mathrm">E</span><span class="mord mathrm">G</span><span class="mord mathrm">R</span><span class="mord mathrm">E</span><span class="mord mathrm">T</span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.0857709999999998em;vertical-align:-0.19444em;"></span><span class="mop">max</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">−</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:0.72777em;vertical-align:-0.08333em;"></span><span class="mord">0</span><span class="mord">.</span><span class="mord">5</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:1.1413309999999999em;vertical-align:-0.25em;"></span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">+</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:1.1413309999999999em;vertical-align:-0.25em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.13889em;">P</span></span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span></span></p>
<p>这个目标不在学习智能体和反派智能体之间进行区分,而是将表现最好的智能体视为反派智能体。只要有一个智能体表现得比另外一个智能体更好,这个目标就会持续改进更弱的智能体。在此期间,另外一个智能体会持续学习,从而为我们衡量遗憾值提供一个更强大的最好表现依据。我们提出的灵活 PAIRED 算法如算法 1 所示:利用策略梯度更新法,我们训练学习智能体和反派智能体优化环境奖励,而训练对抗智能体最大化遗憾值。</p>
<div class="gallery ">
<p><img src="/images/aeg/alg1.png" class="lazyload" data-srcset="/images/aeg/alg1.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="算法 1"></p>
</div>
<h3 id="将预算引入对抗策略">将预算引入对抗策略</h3>
<p>考虑以下场景:智能体在一个购物网站的首页,首页上有很多元素,但是只有一个单独的按钮会进入到账户页面。在探索的过程中,智能体大多数只会因为采取错误的行动而收集负奖励,这个奖励的范围很小(因为只有一个最优行动)。在这种情况下,遗憾值就会很小,并且不提供任何信息,这阻碍了对抗智能体为学习智能体设计难度合适的环境的能力。这个问题在我们提出的灵活遗憾目标下依旧存在。</p>
<p>为了解决这个问题,在遗憾值(将对抗智能体设计的预算与最好的智能体的表现绑定)的基础上,我们使用预算对目标进行增强。我们将对抗智能体的有效预算近似为 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>N</mi></mrow><annotation encoding="application/x-tex">N</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">N</span></span></span></span> 个时刻中非跳过行为的期望数量,并根据学习智能体是否正在学习来调整这个预算。具体地,我们将下面这个最小化预算的目标添加到 PAIRED 目标中对目标进行增强:</p>
<p class="katex-block"><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msub><mi>O</mi><mrow><mi mathvariant="normal">b</mi><mi mathvariant="normal">u</mi><mi mathvariant="normal">d</mi><mi mathvariant="normal">g</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">t</mi></mrow></msub><mo>=</mo><msup><mi>R</mi><mi>A</mi></msup><mo>×</mo><munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>1</mn></mrow><mi>N</mi></munderover><mi>log</mi><mo></mo><mi>π</mi><mo stretchy="false">(</mo><msub><mi>a</mi><mi>i</mi></msub><mo>=</mo><mrow><mi mathvariant="normal">S</mi><mi mathvariant="normal">K</mi><mi mathvariant="normal">I</mi><mi mathvariant="normal">P</mi></mrow><mi mathvariant="normal">∣</mi><msub><mi>a</mi><mrow><mn>0</mn><mo separator="true">,</mo><mo>⋯</mo><mtext> </mtext><mo separator="true">,</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo separator="true">,</mo><msub><mi>b</mi><mrow><mn>0</mn><mo separator="true">,</mo><mo>⋯</mo><mtext> </mtext><mo separator="true">,</mo><mi>i</mi><mo>−</mo><mn>1</mn></mrow></msub><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">O_{\rm budget} = R^A \times \sum\limits_{i=1}^N \log \pi(a_i = {\rm SKIP} | a_{0,\cdots,i-1}, b_{0,\cdots,i-1})
</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.969438em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.02778em;">O</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.3361079999999999em;"><span style="top:-2.5500000000000003em;margin-left:-0.02778em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight"><span class="mord mathrm mtight">b</span><span class="mord mathrm mtight">u</span><span class="mord mathrm mtight">d</span><span class="mord mathrm mtight" style="margin-right:0.01389em;">g</span><span class="mord mathrm mtight">e</span><span class="mord mathrm mtight">t</span></span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:0.9746609999999999em;vertical-align:-0.08333em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8913309999999999em;"><span style="top:-3.113em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2222222222222222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222222222222222em;"></span></span><span class="base"><span class="strut" style="height:3.106005em;vertical-align:-1.277669em;"></span><span class="mop op-limits"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.8283360000000002em;"><span style="top:-1.872331em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mathdefault mtight">i</span><span class="mrel mtight">=</span><span class="mord mtight">1</span></span></span></span><span style="top:-3.050005em;"><span class="pstrut" style="height:3.05em;"></span><span><span class="mop op-symbol large-op">∑</span></span></span><span style="top:-4.3000050000000005em;margin-left:0em;"><span class="pstrut" style="height:3.05em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight" style="margin-right:0.10903em;">N</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:1.277669em;"><span></span></span></span></span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mop">lo<span style="margin-right:0.01389em;">g</span></span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03588em;">π</span><span class="mopen">(</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.31166399999999994em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">i</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.15em;"><span></span></span></span></span></span></span><span class="mspace" style="margin-right:0.2777777777777778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2777777777777778em;"></span></span><span class="base"><span class="strut" style="height:1.036108em;vertical-align:-0.286108em;"></span><span class="mord"><span class="mord"><span class="mord mathrm">S</span><span class="mord mathrm">K</span><span class="mord mathrm">I</span><span class="mord mathrm">P</span></span></span><span class="mord">∣</span><span class="mord"><span class="mord mathdefault">a</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="mpunct mtight">,</span><span class="minner mtight">⋯</span><span class="mspace mtight" style="margin-right:0.19516666666666668em;"></span><span class="mpunct mtight">,</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord"><span class="mord mathdefault">b</span><span class="msupsub"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:0.311664em;"><span style="top:-2.5500000000000003em;margin-left:0em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mtight"><span class="mord mtight">0</span><span class="mpunct mtight">,</span><span class="minner mtight">⋯</span><span class="mspace mtight" style="margin-right:0.19516666666666668em;"></span><span class="mpunct mtight">,</span><span class="mord mathdefault mtight">i</span><span class="mbin mtight">−</span><span class="mord mtight">1</span></span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.286108em;"><span></span></span></span></span></span></span><span class="mclose">)</span></span></span></span></span></p>
<p>其中 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><msup><mi>R</mi><mi>A</mi></msup></mrow><annotation encoding="application/x-tex">R^A</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8413309999999999em;vertical-align:0em;"></span><span class="mord"><span class="mord mathdefault" style="margin-right:0.00773em;">R</span><span class="msupsub"><span class="vlist-t"><span class="vlist-r"><span class="vlist" style="height:0.8413309999999999em;"><span style="top:-3.063em;margin-right:0.05em;"><span class="pstrut" style="height:2.7em;"></span><span class="sizing reset-size6 size3 mtight"><span class="mord mathdefault mtight">A</span></span></span></span></span></span></span></span></span></span></span> 是反派智能体(或者说表现最好的智能体)所获得的奖励。这个目标鼓励对抗智能体在学习智能体没有学习时,使用更少的预算(更多的跳过操作);而在学习智能体从环境中获得了正奖励时,使用更多的预算(更少的跳过操作)。</p>
<h2 id="实验验证">实验验证</h2>
<p>我们在 MiniWoB 框架实现的各种网页环境上测试了我们的模型。我们还利用相同的设计组件实现了几个不同难度等级的网站。这些环境包括了登录、输入地址、航班预定、付款、购物等网页,这些网页要求智能体在页面中导航的时候输入文本或者选择网页中的信息。通过逐渐向网页中增加组件,每个环境都有四个不同的难度等级。这些环境在训练时从未明确地向智能体展示,因此在这些环境上的表现可以看出智能体能够多好地泛化到没有见过的网站。</p>
<p><strong>智能体结构</strong> 我们使用基于 LSTM 的 DOM 树编码器,以及一个编码指令字段的前馈网络。导航智能体通过计算元素编码和字段编码之间的两两相似性,输出一个元素和字段之间的联合分布。我们使用元素的边际分布作为元素编码的注意力权重,并将上下文向量传递给一个前馈神经网络来计算状态价值。网页导航智能体是通过演员-评论家算法进行训练的。而基于 LSTM 的对抗智能体是通过应用了灵活 PAIRED 算法和灵活 b-PAIRED 算法的策略梯度算法进行训练的。</p>
<p><strong>基线方法</strong> 我们将 PAIRED 算法、灵活 PAIRED 算法和灵活 b-PAIRED 算法和两个基线算法进行对比:(1)域随机(Domain Randomization, DR)智能体,首先从均匀分布 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>U</mi><mo stretchy="false">[</mo><mn>0</mn><mo separator="true">,</mo><mi>K</mi><mo stretchy="false">]</mo></mrow><annotation encoding="application/x-tex">U[0, K]</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">U</span><span class="mopen">[</span><span class="mord">0</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.07153em;">K</span><span class="mclose">]</span></span></span></span> 中抽样空白页面的数量 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>k</mi></mrow><annotation encoding="application/x-tex">k</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.69444em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span></span></span></span> ,接下来,在 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>N</mi></mrow><annotation encoding="application/x-tex">N</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.68333em;vertical-align:0em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">N</span></span></span></span> 步中,每一步随机选择一个组件(包括跳过),以及均匀分布 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>U</mi><mo stretchy="false">[</mo><mn>0</mn><mo separator="true">,</mo><mi>k</mi><mo stretchy="false">]</mo></mrow><annotation encoding="application/x-tex">U[0, k]</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathdefault" style="margin-right:0.10903em;">U</span><span class="mopen">[</span><span class="mord">0</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.16666666666666666em;"></span><span class="mord mathdefault" style="margin-right:0.03148em;">k</span><span class="mclose">]</span></span></span></span> 中的一个页面。(2)课程学习(Curriculum Learning,CL)方法,以概率 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi></mrow><annotation encoding="application/x-tex">p</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.19444em;"></span><span class="mord mathdefault">p</span></span></span></span> 随机选择每个组件,其中 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi></mrow><annotation encoding="application/x-tex">p</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.19444em;"></span><span class="mord mathdefault">p</span></span></span></span> 一开始是一个很小的值,并在训练的过程中逐渐达到 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>1.0</mn></mrow><annotation encoding="application/x-tex">1.0</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.64444em;vertical-align:0em;"></span><span class="mord">1</span><span class="mord">.</span><span class="mord">0</span></span></span></span></p>
<h3 id="实验结果">实验结果</h3>
<div class="gallery ">
<p><img src="/images/aeg/fig4.png" class="lazyload" data-srcset="/images/aeg/fig4.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="灵活 PAIRED 算法实验结果"></p>
</div>
<p>我们首先将我们提出的灵活 PAIRED 算法与原始 PAIRED 算法进行比较。如图 4 所示:PAIRED 算法在实验环境中根本无法学习,灵活 PAIRED 算法提升非常明显。一个原因是当智能体是分开的,且有非常相似的奖励时,遗憾值会很小,这一现象在训练时尤其明显。这个没有信息量的信号让对抗智能体很难进行学习。另一方面,灵活 PAIRED 算法计算出的遗憾信号总是正的,这更清楚地向对抗智能体展示哪个环境更具有挑战性但是任务仍然可以被完成。后续的消融实验表明引入预算同时提升了原始 PAIRED 算法和灵活 PAIRED 算法的表现。</p>
<div class="gallery ">
<p><img src="/images/aeg/fig5.png" class="lazyload" data-srcset="/images/aeg/fig5.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=" alt="不同环境上的实验结果"></p>
</div>
<p><strong>测试环境上的比较</strong> 我们在不同难度等级的测试环境上,利用成功率指标,比较了我们提出的模型与基线模型的表现。如图 5 所示:灵活 b-PAIRED 算法比灵活 PAIRED 算法表现更好,说明预算目标显著提升了性能。另外,这两种算法都在所有任务上超越了基线模型,其中灵活 b-PAIRED 算法在难度 1 的任务上达到了超过 <span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>80</mn><mi mathvariant="normal">%</mi></mrow><annotation encoding="application/x-tex">80\%</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.80556em;vertical-align:-0.05556em;"></span><span class="mord">8</span><span class="mord">0</span><span class="mord">%</span></span></span></span> 的成功率。甚至随着环境的复杂度逐渐增加,灵活 b-PAIRED 智能体仍然能够表现得很好,性能下降很小。虽然 CL 方法在训练的早期表现得比灵活 PAIRED 算法更好,但是由于忽略智能体的技能水平,创造出让智能体很难完成的环境,CL 方法在后期的性能下降非常明显。我们还注意到:灵活 b-PAIRED 算法比灵活 PAIRED 算法在所有环境上都学习得更快,因为灵活 b-PAIRED 算法对智能体的表现反应更快。</p>
<p><strong>环境复杂度</strong> 以生成的主动组件的百分比作为环境复杂性的衡量指标。相对而言,在一个具有更多被动组件的网页中学习更加简单。因为组件要么是添加智能体应当忽略的噪音,要么只有在智能体导航到另一个页面时才使用它们。另一方面,如果有更多的主动组件,不仅 DOM 树的大小会增加,命令字段的数量也会增加,这会增加元素和命令之间的匹配难度。如图 4f 所示:灵活 b-PAIRED 算法在最开始随机选择了大约 60% 的主动组件,并逐渐生成更多的主动组件。尽管灵活 b-PAIRED 算法使用了更多的主动组件,智能体依旧能够提升性能,这要归功于灵活 b-PAIRED 算法根据智能体的技能准确调整环境难度的能力。我们还观察到:在训练后期,组件的分布转变为更复杂和相关的组件。</p>
<div class='footer'>
<div class='copyright'>
<blockquote>
<p>博客内容遵循 署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0) 协议</p>
<p>本文永久链接是:<a href=https://lornd.top/aeg.html>https://lornd.top/aeg.html</a></p>
</blockquote>
</div>
</div>
<div class='article-meta' id="bottom">
<div class='new-meta-box'>
<div class="new-meta-item date" itemprop="dateUpdated" datetime="2023-09-15T17:05:49+08:00">
<a class='notlink'>
<i class="fas fa-edit fa-fw" aria-hidden="true"></i>
<p>更新于:2023年9月15日</p>
</a>
</div>
<div class="new-meta-item share -mob-share-list">
<div class="-mob-share-list share-body">
<a class="-mob-share-qq" title="" rel="external nofollow noopener noreferrer noopener"
target="_blank" href="http://connect.qq.com/widget/shareqq/index.html?url=https://lornd.top/aeg.html&title=[论文解读]Adversarial Environment Generation for Learning to Navigate the Web - lornd's blog&summary="
>
<img src="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/qq.png" class="lazyload" data-srcset="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/qq.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=">
</a>
<a class="-mob-share-qzone" title="" rel="external nofollow noopener noreferrer noopener"
target="_blank" href="https://sns.qzone.qq.com/cgi-bin/qzshare/cgi_qzshare_onekey?url=https://lornd.top/aeg.html&title=[论文解读]Adversarial Environment Generation for Learning to Navigate the Web - lornd's blog&summary="
>
<img src="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/qzone.png" class="lazyload" data-srcset="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/qzone.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=">
</a>
<a class="-mob-share-weibo" title="" rel="external nofollow noopener noreferrer noopener"
target="_blank" href="http://service.weibo.com/share/share.php?url=https://lornd.top/aeg.html&title=[论文解读]Adversarial Environment Generation for Learning to Navigate the Web - lornd's blog&summary="
>
<img src="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/weibo.png" class="lazyload" data-srcset="https://cdn.jsdelivr.net/gh/volantis-x/cdn-org/logo/128/weibo.png" srcset="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAABGdBTUEAALGPC/xhBQAAADhlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAAqACAAQAAAABAAAAAaADAAQAAAABAAAAAQAAAADa6r/EAAAAC0lEQVQIHWNgAAIAAAUAAY27m/MAAAAASUVORK5CYII=">
</a>
</div>
</div>
</div>
</div>
<div class="prev-next">
<a class='prev' href='/flin.html'>
<p class='title'><i class="fas fa-chevron-left" aria-hidden="true"></i>[论文解读]FLIN: A Flexible Natural Language Interface for Web Navigation</p>
<p class='content'>论文地址:FLIN: A Flexible Natural Language Interface for Web Navigation 。
摘要
AI 助手现在可以通过直接与网页 UI 交互为用...</p>
</a>
<a class='next' href='/qweb.html'>
<p class='title'>[论文解读]learning to navigate the web<i class="fas fa-chevron-right" aria-hidden="true"></i></p>
<p class='content'>论文地址:learning to navigate the web 。
摘要
在有着巨大的状态空间和动作空间,以及稀疏奖励的环境中进行学习,会阻碍强化学习智能体通过试错的学习过程。例如,在网页上...</p>
</a>
</div>
</article>
<article class="post white-box reveal shadow" id="comments">
<p ct><i class='fas fa-comments'></i> 评论</p>
<div id="valine_container" class="valine_thread">
<i class="fas fa-cog fa-spin fa-fw fa-2x"></i>
</div>
</article>
<script>
// https://github.com/theme-next/hexo-theme-next/blob/master/layout/_third-party/math/mathjax.swig
if (typeof MathJax === 'undefined') {
window.MathJax = {
loader: {
source: {
'[tex]/amsCd': '[tex]/amscd',
'[tex]/AMScd': '[tex]/amscd'
}
},
tex: {
inlineMath: {'[+]': [['$', '$']]},
tags: 'ams'
},
options: {
renderActions: {
findScript: [10, doc => {
document.querySelectorAll('script[type^="math/tex"]').forEach(node => {
const display = !!node.type.match(/; *mode=display/);
const math = new doc.options.MathItem(node.textContent, doc.inputJax[0], display);
const text = document.createTextNode('');
node.parentNode.replaceChild(text, node);
math.start = {node: text, delim: '', n: 0};
math.end = {node: text, delim: '', n: 0};
doc.math.push(math);
});
}, '', false],
insertedScript: [200, () => {
document.querySelectorAll('mjx-container').forEach(node => {
let target = node.parentNode;
if (target.nodeName.toLowerCase() === 'li') {
target.parentNode.classList.add('has-jax');
}
});
}, '', false]
}
}
};
(function () {
var script = document.createElement('script');
script.src = 'https://cdn.jsdelivr.net/npm/mathjax@3.0/es5/tex-mml-chtml.js';
script.defer = true;
document.head.appendChild(script);
})();
} else {
// 文章章节标题不能为 “MathJax” ,否则会报错。
MathJax.startup.document.state(0);
MathJax.texReset();
MathJax.typeset();
}
</script>
</div>
<aside class='l_side'>
<section class="widget toc-wrapper shadow desktop mobile" id="toc-div" >
<header>
<i class="fas fa-list fa-fw" aria-hidden="true"></i><span class='name'>本文目录</span>
</header>
<div class='content'>
<ol class="toc"><li class="toc-item toc-level-2"><a class="toc-link" href="#%E6%91%98%E8%A6%81"><span class="toc-text">摘要</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E7%A0%94%E7%A9%B6%E8%83%8C%E6%99%AF"><span class="toc-text">研究背景</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E6%96%87%E7%AB%A0%E8%B4%A1%E7%8C%AE"><span class="toc-text">文章贡献</span></a></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E9%97%AE%E9%A2%98%E6%8F%8F%E8%BF%B0"><span class="toc-text">问题描述</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E7%BD%91%E9%A1%B5%E5%AF%BC%E8%88%AA%E9%97%AE%E9%A2%98"><span class="toc-text">网页导航问题</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#paired-%E7%AE%97%E6%B3%95"><span class="toc-text">PAIRED 算法</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E8%A7%A3%E5%86%B3%E6%96%B9%E6%B3%95"><span class="toc-text">解决方法</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E7%BD%91%E9%A1%B5%E8%AE%BE%E8%AE%A1%E7%BB%84%E4%BB%B6"><span class="toc-text">网页设计组件</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%AF%B9%E6%8A%97%E6%99%BA%E8%83%BD%E4%BD%93%E6%9E%B6%E6%9E%84"><span class="toc-text">对抗智能体架构</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E7%81%B5%E6%B4%BB-paired-%E7%AE%97%E6%B3%95"><span class="toc-text">灵活 PAIRED 算法</span></a></li><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%B0%86%E9%A2%84%E7%AE%97%E5%BC%95%E5%85%A5%E5%AF%B9%E6%8A%97%E7%AD%96%E7%95%A5"><span class="toc-text">将预算引入对抗策略</span></a></li></ol></li><li class="toc-item toc-level-2"><a class="toc-link" href="#%E5%AE%9E%E9%AA%8C%E9%AA%8C%E8%AF%81"><span class="toc-text">实验验证</span></a><ol class="toc-child"><li class="toc-item toc-level-3"><a class="toc-link" href="#%E5%AE%9E%E9%AA%8C%E7%BB%93%E6%9E%9C"><span class="toc-text">实验结果</span></a></li></ol></li></ol>
</div>
</section>
</aside>
<!--此文件用来存放一些不方便取值的变量-->
<!--思路大概是将值藏到重加载的区域内-->
<script>
window.pdata={}
pdata.ispage=true;
pdata.postTitle="[论文解读]Adversarial Environment Generation for Learning to Navigate the Web";
pdata.commentPath="";
pdata.commentPlaceholder="";
// header 这里无论是否开启pjax都需要
var l_header=document.getElementById("l_header");
l_header.classList.add("show");
// cover
var cover_wrapper=document.querySelector('.cover-wrapper');
cover_wrapper.id="none";
cover_wrapper.style.display="none";
</script>
</div>
<footer class="footer clearfix">
<br><br>
<div class="aplayer-container">
</div>
<br>
<div class="social-wrapper">
</div>
<div><p>博客内容遵循 <a target="_blank" rel="noopener" href="https://creativecommons.org/licenses/by-nc-sa/4.0/deed.zh">署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0) 协议</a></p>
</div>
<div><p><span id="lc-sv">本站总访问量为 <span id='number'><i class="fas fa-circle-notch fa-spin fa-fw" aria-hidden="true"></i></span> 次</span> <span id="lc-uv">访客数为 <span id='number'><i class="fas fa-circle-notch fa-spin fa-fw" aria-hidden="true"></i></span> 人</span></p>
</div>
本站使用
<a href="https://github.com/volantis-x/hexo-theme-volantis/tree/4.3.1" target="_blank" class="codename">Volantis</a>
作为主题
<div class='copyright'>
<p><a href="/">Copyright © 2017-2020 XXX</a></p>
</div>
</footer>
<a id="s-top" class="fas fa-arrow-up fa-fw" href="javascript:void(0)"></a>
</div>
</div>
<div>
<script>
/************这个文件存放不需要重载的全局变量和全局函数*********/
window.volantis={};
window.volantis.loadcss=document.getElementById("loadcss");
/******************** Pjax ********************************/
function VPjax(){
this.list=[] // 存放回调函数
this.start=()=>{
for(var i=0;i<this.list.length;i++){
this.list[i].run();
}
}
this.push=(fn,name)=>{
var f=new PjaxItem(fn,name);
this.list.push(f);
}
// 构造一个可以run的对象
function PjaxItem(fn,name){
// 函数名称
this.name = name || fn.name
// run方法
this.run=()=>{
fn()
}
}
}
volantis.pjax={}
volantis.pjax.method={
complete: new VPjax(),
error: new VPjax(),
send: new VPjax()
}
volantis.pjax={
...volantis.pjax,
push: volantis.pjax.method.complete.push,
error: volantis.pjax.method.error.push,
send: volantis.pjax.method.send.push
}
/********************脚本懒加载函数********************************/
// 已经加入了setTimeout
function loadScript(src, cb) {
setTimeout(function() {
var HEAD = document.getElementsByTagName('head')[0] || document.documentElement;
var script = document.createElement('script');
script.setAttribute('type','text/javascript');
if (cb) script.onload = cb;
script.setAttribute('src', src);
HEAD.appendChild(script);
});
}
//https://github.com/filamentgroup/loadCSS
var loadCSS = function( href, before, media, attributes ){
var doc = window.document;
var ss = doc.createElement( "link" );
var ref;
if( before ){
ref = before;
}
else {
var refs = ( doc.body || doc.getElementsByTagName( "head" )[ 0 ] ).childNodes;
ref = refs[ refs.length - 1];
}
var sheets = doc.styleSheets;
if( attributes ){
for( var attributeName in attributes ){
if( attributes.hasOwnProperty( attributeName ) ){
ss.setAttribute( attributeName, attributes[attributeName] );
}
}
}
ss.rel = "stylesheet";
ss.href = href;
ss.media = "only x";
function ready( cb ){
if( doc.body ){
return cb();
}
setTimeout(function(){
ready( cb );
});
}
ready( function(){
ref.parentNode.insertBefore( ss, ( before ? ref : ref.nextSibling ) );
});
var onloadcssdefined = function( cb ){
var resolvedHref = ss.href;
var i = sheets.length;
while( i-- ){
if( sheets[ i ].href === resolvedHref ){
return cb();
}
}
setTimeout(function() {
onloadcssdefined( cb );
});
};
function loadCB(){
if( ss.addEventListener ){
ss.removeEventListener( "load", loadCB );
}
ss.media = media || "all";
}
if( ss.addEventListener ){
ss.addEventListener( "load", loadCB);
}
ss.onloadcssdefined = onloadcssdefined;
onloadcssdefined( loadCB );
return ss;
};
</script>
<script>
loadCSS("https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.14/css/all.min.css", window.volantis.loadcss);
</script>
<!-- required -->
<script src="https://cdn.jsdelivr.net/npm/jquery@3.5/dist/jquery.min.js"></script>
<script>
function pjax_fancybox() {
$(".md .gallery").find("img").each(function () { //渲染 fancybox
var element = document.createElement("a"); // a 标签
$(element).attr("class", "fancybox");
$(element).attr("pjax-fancybox", ""); // 过滤 pjax
$(element).attr("href", $(this).attr("src"));
if ($(this).attr("data-original")) {
$(element).attr("href", $(this).attr("data-original"));
}
$(element).attr("data-fancybox", "images");
var caption = ""; // 描述信息
if ($(this).attr('alt')) { // 判断当前页面是否存在描述信息
$(element).attr('data-caption', $(this).attr('alt'));
caption = $(this).attr('alt');
}
var div = document.createElement("div");
$(div).addClass("fancybox");
$(this).wrap(div); // 最外层套 div ,其实主要作用还是 class 样式
var span = document.createElement("span");
$(span).addClass("image-caption");
$(span).text(caption); // 加描述
$(this).after(span); // 再套一层描述
$(this).wrap(element); // 最后套 a 标签
})
$(".md .gallery").find("img").fancybox({
selector: '[data-fancybox="images"]',
hash: false,
loop: false,
closeClick: true,
helpers: {
overlay: {closeClick: true}
},
buttons: [
"zoom",
"close"
]
});
};
function SCload_fancybox() {
if ($(".md .gallery").find("img").length == 0) return;
loadCSS("https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.css", document.getElementById("loadcss"));
loadScript('https://cdn.jsdelivr.net/gh/fancyapps/fancybox@3.5.7/dist/jquery.fancybox.min.js', pjax_fancybox)
};
$(function () {
SCload_fancybox();
});
function Pjax_SCload_fancybox(){
if (typeof $.fancybox == "undefined") {
SCload_fancybox();
} else {
pjax_fancybox();
}
}
volantis.pjax.push(Pjax_SCload_fancybox)
volantis.pjax.send(()=>{
if (typeof $.fancybox != "undefined") {
$.fancybox.close(); // 关闭弹窗
}
},'fancybox')
</script>
<!-- internal -->
<script>
function loadIssuesJS() {
if ($(".md").find(".issues-api").length == 0) return;
loadScript('/js/issues.js');
};
$(function () {
loadIssuesJS();
});
volantis.pjax.push(()=>{
if (typeof IssuesAPI == "undefined") {
loadIssuesJS();
}
},"IssuesJS")
</script>
<script defer src="https://cdn.jsdelivr.net/npm/vanilla-lazyload@17.1.0/dist/lazyload.min.js"></script>
<script>
// https://www.npmjs.com/package/vanilla-lazyload
// Set the options globally
// to make LazyLoad self-initialize
window.lazyLoadOptions = {
elements_selector: ".lazyload",
threshold: 0
};
// Listen to the initialization event
// and get the instance of LazyLoad
window.addEventListener(
"LazyLoad::Initialized",
function (event) {
window.lazyLoadInstance = event.detail.instance;
},
false
);
document.addEventListener('DOMContentLoaded', function () {
lazyLoadInstance.update();
});
document.addEventListener('pjax:complete', function () {
lazyLoadInstance.update();
});
</script>
<script>
window.FPConfig = {
delay: 0,
ignoreKeywords: [],
maxRPS: 5,
hoverDelay: 25
};
</script>
<script defer src="https://cdn.jsdelivr.net/gh/gijo-varghese/flying-pages@2.1.2/flying-pages.min.js"></script>
<script src="/js/valine.js"></script>
<script>
function emoji(path, idx, ext) {
return path + "/" + path + "-" + idx + "." + ext;
}
var emojiMaps = {};
for (var i = 1; i <= 54; i++) {