From e55db81c6d66a54e6db1e64481543a4c88973bad Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 16 Sep 2025 20:03:00 +0000 Subject: [PATCH 01/64] init commit for webarena verified --- .../experiments/benchmark/configs.py | 14 + .../benchmark/metadata/webarena_verified.csv | 813 ++++++++++++++++++ .../src/browsergym/experiments/loop.py | 1 + 3 files changed, 828 insertions(+) create mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index ab2858d3..ee239021 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -132,6 +132,20 @@ ), task_metadata=task_metadata("webarena"), ), + "webarena_verified": lambda n_repeats=1: Benchmark( + name="webarena_verified", + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], + is_multi_tab=True, + supports_parallel_seeds=False, + backends=["webarena"], + env_args_list=make_env_args_list_from_repeat_tasks( + task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), + max_steps=30, + n_repeats=n_repeats, + seeds_rng=np.random.RandomState(42), + ), + task_metadata=task_metadata("webarena_verified"), + ), "webarena_lite": lambda n_repeats=1: Benchmark( name="webarena_lite", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv new file mode 100644 index 00000000..2b70a143 --- /dev/null +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv @@ -0,0 +1,813 @@ 
+task_name,requires_reset,sites,eval_types,task_id,browsergym_split,depends_on +webarena_verified.0,False,shopping_admin,retrieve_value,0,train, +webarena_verified.1,False,shopping_admin,retrieve_value,1,test,webarena_verified.0 +webarena_verified.2,False,shopping_admin,retrieve_value,2,train,webarena_verified.1 +webarena_verified.3,False,shopping_admin,retrieve_value,3,test,webarena_verified.2 +webarena_verified.4,False,shopping_admin,retrieve_value,4,train,webarena_verified.3 +webarena_verified.5,False,shopping_admin,retrieve_value,5,train,webarena_verified.4 +webarena_verified.6,False,shopping_admin,retrieve_value,6,test,webarena_verified.5 +webarena_verified.7,False,map,retrieve_value,7,train, +webarena_verified.8,False,map,string_match,8,test,webarena_verified.7 +webarena_verified.9,False,map,retrieve_value,9,test,webarena_verified.8 +webarena_verified.10,False,map,retrieve_value,10,test,webarena_verified.9 +webarena_verified.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.6 +webarena_verified.12,False,shopping_admin,retrieve_value,12,train,webarena_verified.11 +webarena_verified.13,False,shopping_admin,retrieve_value,13,train,webarena_verified.12 +webarena_verified.14,False,shopping_admin,retrieve_value,14,train,webarena_verified.13 +webarena_verified.15,False,shopping_admin,retrieve_value,15,test,webarena_verified.14 +webarena_verified.16,False,map,string_match,16,test,webarena_verified.10 +webarena_verified.17,False,map,string_match,17,train,webarena_verified.16 +webarena_verified.18,False,map,string_match,18,test,webarena_verified.17 +webarena_verified.19,False,map,string_match,19,train,webarena_verified.18 +webarena_verified.20,False,map,string_match,20,test,webarena_verified.19 +webarena_verified.21,False,shopping,retrieve_value,21,test, +webarena_verified.22,False,shopping,retrieve_value,22,test,webarena_verified.21 +webarena_verified.23,False,shopping,retrieve_value,23,test,webarena_verified.22 
+webarena_verified.24,False,shopping,retrieve_value,24,test,webarena_verified.23 +webarena_verified.25,False,shopping,retrieve_value,25,test,webarena_verified.24 +webarena_verified.26,False,shopping,retrieve_value,26,test,webarena_verified.25 +webarena_verified.27,False,reddit,retrieve_value,27,test, +webarena_verified.28,False,reddit,retrieve_value,28,train,webarena_verified.27 +webarena_verified.29,False,reddit,retrieve_value,29,train,webarena_verified.28 +webarena_verified.30,False,reddit,retrieve_value,30,test,webarena_verified.29 +webarena_verified.31,False,reddit,retrieve_value,31,train,webarena_verified.30 +webarena_verified.32,False,map,retrieve_value,32,test,webarena_verified.20 +webarena_verified.33,False,map,retrieve_value,33,test,webarena_verified.32 +webarena_verified.34,False,map,retrieve_value,34,train,webarena_verified.33 +webarena_verified.35,False,map,retrieve_value,35,test,webarena_verified.34 +webarena_verified.36,False,map,retrieve_value,36,test,webarena_verified.35 +webarena_verified.37,False,map,retrieve_value,37,train,webarena_verified.36 +webarena_verified.38,False,map,retrieve_value,38,train,webarena_verified.37 +webarena_verified.39,False,map,retrieve_value,39,train,webarena_verified.38 +webarena_verified.40,False,map,retrieve_value,40,test,webarena_verified.39 +webarena_verified.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.15 +webarena_verified.42,False,shopping_admin,retrieve_value,42,train,webarena_verified.41 +webarena_verified.43,False,shopping_admin,retrieve_value,43,test,webarena_verified.42 +webarena_verified.44,False,gitlab,ui_state,44,train, +webarena_verified.45,False,gitlab,ui_state,45,test,webarena_verified.44 +webarena_verified.46,False,gitlab,ui_state,46,test,webarena_verified.45 +webarena_verified.47,False,shopping,retrieve_value,47,train,webarena_verified.26 +webarena_verified.48,False,shopping,retrieve_value,48,test,webarena_verified.47 
+webarena_verified.49,False,shopping,retrieve_value,49,train,webarena_verified.48 +webarena_verified.50,False,shopping,retrieve_value,50,train,webarena_verified.49 +webarena_verified.51,False,shopping,retrieve_value,51,test,webarena_verified.50 +webarena_verified.52,False,map,string_match,52,test,webarena_verified.40 +webarena_verified.53,False,map,string_match,53,train,webarena_verified.52 +webarena_verified.54,False,map,string_match,54,test,webarena_verified.53 +webarena_verified.55,False,map,string_match,55,train,webarena_verified.54 +webarena_verified.56,False,map,string_match,56,train,webarena_verified.55 +webarena_verified.57,False,map,retrieve_value,57,train,webarena_verified.56 +webarena_verified.58,False,map,retrieve_value,58,train,webarena_verified.57 +webarena_verified.59,False,map,retrieve_value,59,test,webarena_verified.58 +webarena_verified.60,False,map,retrieve_value,60,test,webarena_verified.59 +webarena_verified.61,False,map,retrieve_value,61,train,webarena_verified.60 +webarena_verified.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.43 +webarena_verified.63,False,shopping_admin,retrieve_value,63,test,webarena_verified.62 +webarena_verified.64,False,shopping_admin,retrieve_value,64,test,webarena_verified.63 +webarena_verified.65,False,shopping_admin,retrieve_value,65,train,webarena_verified.64 +webarena_verified.66,False,reddit,retrieve_value,66,test,webarena_verified.31 +webarena_verified.67,False,reddit,retrieve_value,67,test,webarena_verified.66 +webarena_verified.68,False,reddit,retrieve_value,68,train,webarena_verified.67 +webarena_verified.69,False,reddit,retrieve_value,69,test,webarena_verified.68 +webarena_verified.70,False,map,retrieve_value,70,train,webarena_verified.61 +webarena_verified.71,False,map,retrieve_value,71,test,webarena_verified.70 +webarena_verified.72,False,map,retrieve_value,72,train,webarena_verified.71 +webarena_verified.73,False,map,retrieve_value,73,test,webarena_verified.72 
+webarena_verified.74,False,map,string_match,74,train,webarena_verified.73 +webarena_verified.75,False,map,string_match,75,train,webarena_verified.74 +webarena_verified.76,False,map,retrieve_value,76,train,webarena_verified.75 +webarena_verified.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.65 +webarena_verified.78,False,shopping_admin,retrieve_value,78,train,webarena_verified.77 +webarena_verified.79,False,shopping_admin,retrieve_value,79,test,webarena_verified.78 +webarena_verified.80,False,map,string_match,80,test,webarena_verified.76 +webarena_verified.81,False,map,string_match,81,test,webarena_verified.80 +webarena_verified.82,False,map,string_match,82,train,webarena_verified.81 +webarena_verified.83,False,map,string_match,83,train,webarena_verified.82 +webarena_verified.84,False,map,string_match,84,train,webarena_verified.83 +webarena_verified.85,False,map,string_match,85,test,webarena_verified.84 +webarena_verified.86,False,map,string_match,86,test,webarena_verified.85 +webarena_verified.87,False,map,string_match,87,train,webarena_verified.86 +webarena_verified.88,False,map,string_match,88,train,webarena_verified.87 +webarena_verified.89,False,map,retrieve_value,89,test,webarena_verified.88 +webarena_verified.90,False,map,retrieve_value,90,test,webarena_verified.89 +webarena_verified.91,False,map,retrieve_value,91,train,webarena_verified.90 +webarena_verified.92,False,map,retrieve_value,92,train,webarena_verified.91 +webarena_verified.93,False,map,retrieve_value,93,train,webarena_verified.92 +webarena_verified.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.79 +webarena_verified.95,False,shopping_admin,retrieve_value,95,train,webarena_verified.94 +webarena_verified.96,False,shopping,retrieve_value,96,test,webarena_verified.51 +webarena_verified.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.93 +webarena_verified.98,False,map,retrieve_value,98,test,webarena_verified.97 
+webarena_verified.99,False,map,retrieve_value,99,train,webarena_verified.98 +webarena_verified.100,False,map,retrieve_value,100,test,webarena_verified.99 +webarena_verified.101,False,map,string_match,101,train,webarena_verified.100 +webarena_verified.102,False,gitlab,ui_state,102,train,webarena_verified.46 +webarena_verified.103,False,gitlab,ui_state,103,train,webarena_verified.102 +webarena_verified.104,False,gitlab,ui_state,104,test,webarena_verified.103 +webarena_verified.105,False,gitlab,ui_state,105,train,webarena_verified.104 +webarena_verified.106,False,gitlab,ui_state,106,test,webarena_verified.105 +webarena_verified.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.95 +webarena_verified.108,False,shopping_admin,retrieve_value,108,train,webarena_verified.107 +webarena_verified.109,False,shopping_admin,retrieve_value,109,test,webarena_verified.108 +webarena_verified.110,False,shopping_admin,retrieve_value,110,train,webarena_verified.109 +webarena_verified.111,False,shopping_admin,retrieve_value,111,train,webarena_verified.110 +webarena_verified.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.111 +webarena_verified.113,False,shopping_admin,retrieve_value,113,test,webarena_verified.112 +webarena_verified.114,False,shopping_admin,retrieve_value,114,train,webarena_verified.113 +webarena_verified.115,False,shopping_admin,retrieve_value,115,test,webarena_verified.114 +webarena_verified.116,False,shopping_admin,retrieve_value,116,test,webarena_verified.115 +webarena_verified.117,False,shopping,retrieve_value,117,test,webarena_verified.96 +webarena_verified.118,False,shopping,program_html,118,train,webarena_verified.117 +webarena_verified.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.116 +webarena_verified.120,False,shopping_admin,retrieve_value,120,train,webarena_verified.119 +webarena_verified.121,False,shopping_admin,retrieve_value,121,train,webarena_verified.120 
+webarena_verified.122,False,shopping_admin,retrieve_value,122,test,webarena_verified.121 +webarena_verified.123,False,shopping_admin,retrieve_value,123,train,webarena_verified.122 +webarena_verified.124,False,shopping,retrieve_value,124,train,webarena_verified.118 +webarena_verified.125,False,shopping,retrieve_value,125,train,webarena_verified.124 +webarena_verified.126,False,shopping,retrieve_value,126,test,webarena_verified.125 +webarena_verified.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.123 +webarena_verified.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.127 +webarena_verified.129,False,shopping_admin,retrieve_value,129,train,webarena_verified.128 +webarena_verified.130,False,shopping_admin,retrieve_value,130,train,webarena_verified.129 +webarena_verified.131,False,shopping_admin,retrieve_value,131,test,webarena_verified.130 +webarena_verified.132,False,gitlab,retrieve_value,132,train,webarena_verified.106 +webarena_verified.133,False,gitlab,retrieve_value,133,test,webarena_verified.132 +webarena_verified.134,False,gitlab,retrieve_value,134,test,webarena_verified.133 +webarena_verified.135,False,gitlab,retrieve_value,135,train,webarena_verified.134 +webarena_verified.136,False,gitlab,retrieve_value,136,train,webarena_verified.135 +webarena_verified.137,False,map,string_match,137,test,webarena_verified.101 +webarena_verified.138,False,map,string_match,138,test,webarena_verified.137 +webarena_verified.139,False,map,string_match,139,test,webarena_verified.138 +webarena_verified.140,False,map,string_match,140,train,webarena_verified.139 +webarena_verified.141,False,shopping,retrieve_value,141,train,webarena_verified.126 +webarena_verified.142,False,shopping,retrieve_value,142,train,webarena_verified.141 +webarena_verified.143,False,shopping,retrieve_value,143,test,webarena_verified.142 +webarena_verified.144,False,shopping,retrieve_value,144,test,webarena_verified.143 
+webarena_verified.145,False,shopping,retrieve_value,145,train,webarena_verified.144 +webarena_verified.146,False,shopping,retrieve_value,146,test,webarena_verified.145 +webarena_verified.147,False,shopping,retrieve_value,147,train,webarena_verified.146 +webarena_verified.148,False,shopping,retrieve_value,148,train,webarena_verified.147 +webarena_verified.149,False,shopping,retrieve_value,149,test,webarena_verified.148 +webarena_verified.150,False,shopping,retrieve_value,150,train,webarena_verified.149 +webarena_verified.151,False,map,string_match,151,train,webarena_verified.140 +webarena_verified.152,False,map,string_match,152,train,webarena_verified.151 +webarena_verified.153,False,map,string_match,153,test,webarena_verified.152 +webarena_verified.154,False,map,string_match,154,train,webarena_verified.153 +webarena_verified.155,False,map,string_match,155,test,webarena_verified.154 +webarena_verified.156,False,gitlab,ui_state,156,test,webarena_verified.136 +webarena_verified.157,False,shopping_admin,ui_state,157,train,webarena_verified.131 +webarena_verified.158,False,shopping,ui_state,158,test,webarena_verified.150 +webarena_verified.159,False,shopping,ui_state,159,train,webarena_verified.158 +webarena_verified.160,False,shopping,ui_state,160,train,webarena_verified.159 +webarena_verified.161,False,shopping,ui_state,161,train,webarena_verified.160 +webarena_verified.162,False,shopping,ui_state,162,test,webarena_verified.161 +webarena_verified.163,False,shopping,retrieve_value,163,test,webarena_verified.162 +webarena_verified.164,False,shopping,retrieve_value,164,test,webarena_verified.163 +webarena_verified.165,False,shopping,retrieve_value,165,test,webarena_verified.164 +webarena_verified.166,False,shopping,retrieve_value,166,test,webarena_verified.165 +webarena_verified.167,False,shopping,retrieve_value,167,test,webarena_verified.166 +webarena_verified.168,False,gitlab,retrieve_value,168,test,webarena_verified.156 
+webarena_verified.169,False,gitlab,retrieve_value,169,train,webarena_verified.168 +webarena_verified.170,False,gitlab,retrieve_value,170,train,webarena_verified.169 +webarena_verified.171,False,gitlab,retrieve_value,171,test,webarena_verified.170 +webarena_verified.172,False,gitlab,retrieve_value,172,train,webarena_verified.171 +webarena_verified.173,False,gitlab,retrieve_value,173,train,webarena_verified.172 +webarena_verified.174,False,gitlab,retrieve_value,174,test,webarena_verified.173 +webarena_verified.175,False,gitlab,retrieve_value,175,train,webarena_verified.174 +webarena_verified.176,False,gitlab,retrieve_value,176,train,webarena_verified.175 +webarena_verified.177,False,gitlab,retrieve_value,177,test,webarena_verified.176 +webarena_verified.178,False,gitlab,retrieve_value,178,test,webarena_verified.177 +webarena_verified.179,False,gitlab,retrieve_value,179,train,webarena_verified.178 +webarena_verified.180,False,gitlab,retrieve_value,180,train,webarena_verified.179 +webarena_verified.181,False,gitlab,retrieve_value,181,test,webarena_verified.180 +webarena_verified.182,False,gitlab,retrieve_value,182,train,webarena_verified.181 +webarena_verified.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.157 +webarena_verified.184,False,shopping_admin,retrieve_value,184,train,webarena_verified.183 +webarena_verified.185,False,shopping_admin,retrieve_value,185,test,webarena_verified.184 +webarena_verified.186,False,shopping_admin,retrieve_value,186,train,webarena_verified.185 +webarena_verified.187,False,shopping_admin,retrieve_value,187,test,webarena_verified.186 +webarena_verified.188,False,shopping,retrieve_value,188,test,webarena_verified.167 +webarena_verified.189,False,shopping,retrieve_value,189,train,webarena_verified.188 +webarena_verified.190,False,shopping,retrieve_value,190,train,webarena_verified.189 +webarena_verified.191,False,shopping,retrieve_value,191,train,webarena_verified.190 
+webarena_verified.192,False,shopping,retrieve_value,192,test,webarena_verified.191 +webarena_verified.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.187 +webarena_verified.194,False,shopping_admin,retrieve_value,194,train,webarena_verified.193 +webarena_verified.195,False,shopping_admin,retrieve_value,195,test,webarena_verified.194 +webarena_verified.196,False,shopping_admin,retrieve_value,196,train,webarena_verified.195 +webarena_verified.197,False,shopping_admin,retrieve_value,197,train,webarena_verified.196 +webarena_verified.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.197 +webarena_verified.199,False,shopping_admin,retrieve_value,199,train,webarena_verified.198 +webarena_verified.200,False,shopping_admin,retrieve_value,200,train,webarena_verified.199 +webarena_verified.201,False,shopping_admin,retrieve_value,201,test,webarena_verified.200 +webarena_verified.202,False,shopping_admin,retrieve_value,202,train,webarena_verified.201 +webarena_verified.203,False,shopping_admin,retrieve_value,203,test,webarena_verified.202 +webarena_verified.204,False,shopping_admin,retrieve_value,204,test,webarena_verified.203 +webarena_verified.205,False,gitlab,retrieve_value,205,train,webarena_verified.182 +webarena_verified.206,False,gitlab,retrieve_value,206,test,webarena_verified.205 +webarena_verified.207,False,gitlab,retrieve_value,207,test,webarena_verified.206 +webarena_verified.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.204 +webarena_verified.209,False,shopping_admin,retrieve_value,209,test,webarena_verified.208 +webarena_verified.210,False,shopping_admin,retrieve_value,210,train,webarena_verified.209 +webarena_verified.211,False,shopping_admin,retrieve_value,211,train,webarena_verified.210 +webarena_verified.212,False,shopping_admin,retrieve_value,212,train,webarena_verified.211 +webarena_verified.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.212 
+webarena_verified.214,False,shopping_admin,retrieve_value,214,train,webarena_verified.213 +webarena_verified.215,False,shopping_admin,retrieve_value,215,test,webarena_verified.214 +webarena_verified.216,False,shopping_admin,retrieve_value,216,train,webarena_verified.215 +webarena_verified.217,False,shopping_admin,retrieve_value,217,train,webarena_verified.216 +webarena_verified.218,False,map,string_match,218,train,webarena_verified.155 +webarena_verified.219,False,map,string_match,219,test,webarena_verified.218 +webarena_verified.220,False,map,string_match,220,train,webarena_verified.219 +webarena_verified.221,False,map,string_match,221,test,webarena_verified.220 +webarena_verified.222,False,map,string_match,222,train,webarena_verified.221 +webarena_verified.223,False,map,string_match,223,test,webarena_verified.222 +webarena_verified.224,False,map,string_match,224,test,webarena_verified.223 +webarena_verified.225,False,shopping,retrieve_value,225,test,webarena_verified.192 +webarena_verified.226,False,shopping,retrieve_value,226,train,webarena_verified.225 +webarena_verified.227,False,shopping,retrieve_value,227,train,webarena_verified.226 +webarena_verified.228,False,shopping,retrieve_value,228,test,webarena_verified.227 +webarena_verified.229,False,shopping,retrieve_value,229,test,webarena_verified.228 +webarena_verified.230,False,shopping,retrieve_value,230,train,webarena_verified.229 +webarena_verified.231,False,shopping,retrieve_value,231,test,webarena_verified.230 +webarena_verified.232,False,shopping,retrieve_value,232,train,webarena_verified.231 +webarena_verified.233,False,shopping,retrieve_value,233,test,webarena_verified.232 +webarena_verified.234,False,shopping,retrieve_value,234,train,webarena_verified.233 +webarena_verified.235,False,shopping,retrieve_value,235,train,webarena_verified.234 +webarena_verified.236,False,map,retrieve_value,236,train,webarena_verified.224 +webarena_verified.237,False,map,retrieve_value,237,train,webarena_verified.236 
+webarena_verified.238,False,shopping,ui_state,238,train,webarena_verified.235 +webarena_verified.239,False,shopping,ui_state,239,train,webarena_verified.238 +webarena_verified.240,False,shopping,ui_state,240,test,webarena_verified.239 +webarena_verified.241,False,shopping,ui_state,241,train,webarena_verified.240 +webarena_verified.242,False,shopping,ui_state,242,test,webarena_verified.241 +webarena_verified.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.217 +webarena_verified.244,False,shopping_admin,retrieve_value,244,test,webarena_verified.243 +webarena_verified.245,False,shopping_admin,retrieve_value,245,train,webarena_verified.244 +webarena_verified.246,False,shopping_admin,retrieve_value,246,test,webarena_verified.245 +webarena_verified.247,False,shopping_admin,retrieve_value,247,train,webarena_verified.246 +webarena_verified.248,False,map,retrieve_value,248,test,webarena_verified.237 +webarena_verified.249,False,map,retrieve_value,249,train,webarena_verified.248 +webarena_verified.250,False,map,retrieve_value,250,test,webarena_verified.249 +webarena_verified.251,False,map,retrieve_value,251,train,webarena_verified.250 +webarena_verified.252,False,map,retrieve_value,252,train,webarena_verified.251 +webarena_verified.253,False,map,string_match,253,test,webarena_verified.252 +webarena_verified.254,False,map,retrieve_value,254,train,webarena_verified.253 +webarena_verified.255,False,map,retrieve_value,255,test,webarena_verified.254 +webarena_verified.256,False,map,retrieve_value,256,train,webarena_verified.255 +webarena_verified.257,False,map,string_match,257,test,webarena_verified.256 +webarena_verified.258,False,gitlab,ui_state,258,train,webarena_verified.207 +webarena_verified.259,False,gitlab,retrieve_value,259,train,webarena_verified.258 +webarena_verified.260,False,shopping,ui_state,260,test,webarena_verified.242 +webarena_verified.261,False,shopping,ui_state,261,train,webarena_verified.260 
+webarena_verified.262,False,shopping,ui_state,262,train,webarena_verified.261 +webarena_verified.263,False,shopping,ui_state,263,test,webarena_verified.262 +webarena_verified.264,False,shopping,ui_state,264,train,webarena_verified.263 +webarena_verified.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.257 +webarena_verified.266,False,wikipedia map,retrieve_value,266,test,webarena_verified.265 +webarena_verified.267,False,wikipedia map,retrieve_value,267,train,webarena_verified.266 +webarena_verified.268,False,wikipedia map,retrieve_value,268,test,webarena_verified.267 +webarena_verified.269,False,shopping,ui_state,269,train,webarena_verified.264 +webarena_verified.270,False,shopping,ui_state,270,train,webarena_verified.269 +webarena_verified.271,False,shopping,ui_state,271,test,webarena_verified.270 +webarena_verified.272,False,shopping,ui_state,272,test,webarena_verified.271 +webarena_verified.273,False,shopping,ui_state,273,train,webarena_verified.272 +webarena_verified.274,False,shopping,ui_state,274,test,webarena_verified.273 +webarena_verified.275,False,shopping,ui_state,275,test,webarena_verified.274 +webarena_verified.276,False,shopping,ui_state,276,train,webarena_verified.275 +webarena_verified.277,False,shopping,ui_state,277,train,webarena_verified.276 +webarena_verified.278,False,shopping,ui_state,278,train,webarena_verified.277 +webarena_verified.279,False,shopping,retrieve_value,279,train,webarena_verified.278 +webarena_verified.280,False,shopping,retrieve_value,280,test,webarena_verified.279 +webarena_verified.281,False,shopping,retrieve_value,281,train,webarena_verified.280 +webarena_verified.282,False,shopping,retrieve_value,282,train,webarena_verified.281 +webarena_verified.283,False,shopping,ui_state,283,test,webarena_verified.282 +webarena_verified.284,False,shopping,ui_state,284,test,webarena_verified.283 +webarena_verified.285,False,shopping,ui_state,285,train,webarena_verified.284 
+webarena_verified.286,False,shopping,ui_state,286,test,webarena_verified.285 +webarena_verified.287,False,map,string_match,287,test,webarena_verified.268 +webarena_verified.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.247 +webarena_verified.289,False,shopping_admin,retrieve_value,289,test,webarena_verified.288 +webarena_verified.290,False,shopping_admin,retrieve_value,290,train,webarena_verified.289 +webarena_verified.291,False,shopping_admin,retrieve_value,291,train,webarena_verified.290 +webarena_verified.292,False,shopping_admin,retrieve_value,292,test,webarena_verified.291 +webarena_verified.293,False,gitlab,retrieve_value,293,train,webarena_verified.259 +webarena_verified.294,False,gitlab,retrieve_value,294,train,webarena_verified.293 +webarena_verified.295,False,gitlab,retrieve_value,295,test,webarena_verified.294 +webarena_verified.296,False,gitlab,retrieve_value,296,train,webarena_verified.295 +webarena_verified.297,False,gitlab,retrieve_value,297,test,webarena_verified.296 +webarena_verified.298,False,shopping,ui_state,298,train,webarena_verified.286 +webarena_verified.299,False,shopping,ui_state,299,train,webarena_verified.298 +webarena_verified.300,False,shopping,ui_state,300,test,webarena_verified.299 +webarena_verified.301,False,shopping,retrieve_value,301,test,webarena_verified.300 +webarena_verified.302,False,shopping,retrieve_value,302,train,webarena_verified.301 +webarena_verified.303,False,gitlab,retrieve_value,303,test,webarena_verified.297 +webarena_verified.304,False,gitlab,retrieve_value,304,train,webarena_verified.303 +webarena_verified.305,False,gitlab,retrieve_value,305,train,webarena_verified.304 +webarena_verified.306,False,gitlab,retrieve_value,306,test,webarena_verified.305 +webarena_verified.307,False,gitlab,retrieve_value,307,train,webarena_verified.306 +webarena_verified.308,False,gitlab,retrieve_value,308,train,webarena_verified.307 
+webarena_verified.309,False,gitlab,retrieve_value,309,train,webarena_verified.308 +webarena_verified.310,False,gitlab,retrieve_value,310,train,webarena_verified.309 +webarena_verified.311,False,gitlab,retrieve_value,311,test,webarena_verified.310 +webarena_verified.312,False,gitlab,retrieve_value,312,test,webarena_verified.311 +webarena_verified.313,False,shopping,retrieve_value,313,train,webarena_verified.302 +webarena_verified.314,False,gitlab,retrieve_value,314,train,webarena_verified.312 +webarena_verified.315,False,gitlab,retrieve_value,315,train,webarena_verified.314 +webarena_verified.316,False,gitlab,retrieve_value,316,test,webarena_verified.315 +webarena_verified.317,False,gitlab,retrieve_value,317,test,webarena_verified.316 +webarena_verified.318,False,gitlab,retrieve_value,318,train,webarena_verified.317 +webarena_verified.319,False,shopping,retrieve_value,319,train,webarena_verified.313 +webarena_verified.320,False,shopping,retrieve_value,320,test,webarena_verified.319 +webarena_verified.321,False,shopping,retrieve_value,321,train,webarena_verified.320 +webarena_verified.322,False,shopping,retrieve_value,322,test,webarena_verified.321 +webarena_verified.323,False,shopping,retrieve_value,323,train,webarena_verified.322 +webarena_verified.324,False,shopping,ui_state,324,train,webarena_verified.323 +webarena_verified.325,False,shopping,ui_state,325,test,webarena_verified.324 +webarena_verified.326,False,shopping,ui_state,326,train,webarena_verified.325 +webarena_verified.327,False,shopping,ui_state,327,test,webarena_verified.326 +webarena_verified.328,False,shopping,ui_state,328,train,webarena_verified.327 +webarena_verified.329,False,shopping,retrieve_value,329,test,webarena_verified.328 +webarena_verified.330,False,shopping,retrieve_value,330,test,webarena_verified.329 +webarena_verified.331,False,shopping,retrieve_value,331,test,webarena_verified.330 +webarena_verified.332,False,shopping,retrieve_value,332,train,webarena_verified.331 
+webarena_verified.333,False,shopping,retrieve_value,333,train,webarena_verified.332 +webarena_verified.334,False,shopping,retrieve_value,334,train,webarena_verified.333 +webarena_verified.335,False,shopping,retrieve_value,335,train,webarena_verified.334 +webarena_verified.336,False,shopping,retrieve_value,336,test,webarena_verified.335 +webarena_verified.337,False,shopping,retrieve_value,337,test,webarena_verified.336 +webarena_verified.338,False,shopping,retrieve_value,338,train,webarena_verified.337 +webarena_verified.339,False,gitlab,ui_state,339,test,webarena_verified.318 +webarena_verified.340,False,gitlab,ui_state,340,train,webarena_verified.339 +webarena_verified.341,False,gitlab,ui_state,341,test,webarena_verified.340 +webarena_verified.342,False,gitlab,ui_state,342,test,webarena_verified.341 +webarena_verified.343,False,gitlab,ui_state,343,test,webarena_verified.342 +webarena_verified.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.292 +webarena_verified.345,False,shopping_admin,retrieve_value,345,train,webarena_verified.344 +webarena_verified.346,False,shopping_admin,retrieve_value,346,train,webarena_verified.345 +webarena_verified.347,False,shopping_admin,retrieve_value,347,train,webarena_verified.346 +webarena_verified.348,False,shopping_admin,retrieve_value,348,test,webarena_verified.347 +webarena_verified.349,False,gitlab,retrieve_value,349,test,webarena_verified.343 +webarena_verified.350,False,gitlab,retrieve_value,350,test,webarena_verified.349 +webarena_verified.351,False,shopping,ui_state,351,train,webarena_verified.338 +webarena_verified.352,False,shopping,ui_state,352,test,webarena_verified.351 +webarena_verified.353,False,shopping,ui_state,353,test,webarena_verified.352 +webarena_verified.354,False,shopping,ui_state,354,train,webarena_verified.353 +webarena_verified.355,False,shopping,ui_state,355,train,webarena_verified.354 +webarena_verified.356,False,map,program_html,356,test,webarena_verified.287 
+webarena_verified.357,False,gitlab,ui_state,357,test,webarena_verified.350 +webarena_verified.358,False,shopping,retrieve_value,358,train,webarena_verified.355 +webarena_verified.359,False,shopping,retrieve_value,359,test,webarena_verified.358 +webarena_verified.360,False,shopping,retrieve_value,360,train,webarena_verified.359 +webarena_verified.361,False,shopping,retrieve_value,361,train,webarena_verified.360 +webarena_verified.362,False,shopping,retrieve_value,362,test,webarena_verified.361 +webarena_verified.363,False,map,retrieve_value,363,train,webarena_verified.356 +webarena_verified.364,False,map,retrieve_value,364,test,webarena_verified.363 +webarena_verified.365,False,map,retrieve_value,365,test,webarena_verified.364 +webarena_verified.366,False,map,retrieve_value,366,train,webarena_verified.365 +webarena_verified.367,False,map,retrieve_value,367,train,webarena_verified.366 +webarena_verified.368,False,shopping,retrieve_value,368,test,webarena_verified.362 +webarena_verified.369,False,map,program_html,369,train,webarena_verified.367 +webarena_verified.370,False,map,program_html,370,test,webarena_verified.369 +webarena_verified.371,False,map,program_html,371,test,webarena_verified.370 +webarena_verified.372,False,map,program_html,372,train,webarena_verified.371 +webarena_verified.373,False,map,program_html,373,train,webarena_verified.372 +webarena_verified.374,False,shopping_admin,ui_state,374,train,webarena_verified.348 +webarena_verified.375,False,shopping_admin,ui_state,375,train,webarena_verified.374 +webarena_verified.376,False,shopping,retrieve_value,376,test,webarena_verified.368 +webarena_verified.377,False,map,ui_state,377,test,webarena_verified.373 +webarena_verified.378,False,map,ui_state,378,train,webarena_verified.377 +webarena_verified.379,False,map,ui_state,379,train,webarena_verified.378 +webarena_verified.380,False,map,ui_state,380,test,webarena_verified.379 +webarena_verified.381,False,map,ui_state,381,train,webarena_verified.380 
+webarena_verified.382,False,map,string_match,382,test,webarena_verified.381 +webarena_verified.383,False,map,retrieve_value,383,test,webarena_verified.382 +webarena_verified.384,False,shopping,retrieve_value,384,test,webarena_verified.376 +webarena_verified.385,False,shopping,retrieve_value,385,train,webarena_verified.384 +webarena_verified.386,False,shopping,retrieve_value,386,test,webarena_verified.385 +webarena_verified.387,False,shopping,retrieve_value,387,train,webarena_verified.386 +webarena_verified.388,False,shopping,retrieve_value,388,test,webarena_verified.387 +webarena_verified.389,False,gitlab,backend_state,389,test,webarena_verified.357 +webarena_verified.390,False,gitlab,backend_state,390,train,webarena_verified.389 +webarena_verified.391,False,gitlab,backend_state,391,train,webarena_verified.390 +webarena_verified.392,False,gitlab,backend_state,392,test,webarena_verified.391 +webarena_verified.393,False,gitlab,backend_state,393,train,webarena_verified.392 +webarena_verified.394,False,gitlab,backend_state,394,test,webarena_verified.393 +webarena_verified.395,False,gitlab,backend_state,395,train,webarena_verified.394 +webarena_verified.396,False,gitlab,backend_state,396,train,webarena_verified.395 +webarena_verified.397,False,gitlab,backend_state,397,train,webarena_verified.396 +webarena_verified.398,False,gitlab,backend_state,398,test,webarena_verified.397 +webarena_verified.399,False,reddit,backend_state,399,train,webarena_verified.69 +webarena_verified.400,False,reddit,backend_state,400,test,webarena_verified.399 +webarena_verified.401,False,reddit,backend_state,401,train,webarena_verified.400 +webarena_verified.402,False,reddit,backend_state,402,train,webarena_verified.401 +webarena_verified.403,False,reddit,backend_state,403,test,webarena_verified.402 +webarena_verified.404,False,reddit,backend_state,404,train,webarena_verified.403 +webarena_verified.405,False,reddit,backend_state,405,test,webarena_verified.404 
+webarena_verified.406,False,reddit,backend_state,406,train,webarena_verified.405 +webarena_verified.407,False,reddit,backend_state,407,test,webarena_verified.406 +webarena_verified.408,False,reddit,backend_state,408,train,webarena_verified.407 +webarena_verified.409,False,reddit,backend_state,409,test,webarena_verified.408 +webarena_verified.410,False,reddit,backend_state,410,test,webarena_verified.409 +webarena_verified.411,False,gitlab,backend_state,411,test,webarena_verified.398 +webarena_verified.412,False,gitlab,backend_state,412,test,webarena_verified.411 +webarena_verified.413,False,gitlab,backend_state,413,test,webarena_verified.412 +webarena_verified.414,False,gitlab,backend_state,414,test,webarena_verified.413 +webarena_verified.415,False,gitlab,backend_state,415,test,webarena_verified.414 +webarena_verified.416,False,gitlab,backend_state,416,test,webarena_verified.415 +webarena_verified.417,False,gitlab,backend_state,417,test,webarena_verified.416 +webarena_verified.418,False,gitlab,backend_state,418,train,webarena_verified.417 +webarena_verified.419,False,gitlab,backend_state,419,test,webarena_verified.418 +webarena_verified.420,False,gitlab,backend_state,420,test,webarena_verified.419 +webarena_verified.421,False,gitlab,backend_state,421,train,webarena_verified.420 +webarena_verified.422,False,gitlab,backend_state,422,train,webarena_verified.421 +webarena_verified.423,False,shopping_admin,backend_state,423,train,webarena_verified.375 +webarena_verified.424,False,wikipedia map,program_html,424,train,webarena_verified.383 +webarena_verified.425,False,wikipedia map,program_html,425,train,webarena_verified.424 +webarena_verified.426,False,wikipedia map,program_html,426,test,webarena_verified.425 +webarena_verified.427,False,wikipedia map,program_html,427,test,webarena_verified.426 +webarena_verified.428,False,wikipedia map,program_html,428,train,webarena_verified.427 +webarena_verified.429,False,wikipedia map,program_html,429,train,webarena_verified.428 
+webarena_verified.430,False,wikipedia map,program_html,430,test,webarena_verified.429 +webarena_verified.431,False,shopping,program_html,431,train,webarena_verified.388 +webarena_verified.432,False,shopping,backend_state,432,test,webarena_verified.431 +webarena_verified.433,False,shopping,backend_state,433,train,webarena_verified.432 +webarena_verified.434,False,shopping,backend_state,434,train,webarena_verified.433 +webarena_verified.435,False,shopping,backend_state,435,train,webarena_verified.434 +webarena_verified.436,False,shopping,backend_state,436,test,webarena_verified.435 +webarena_verified.437,False,shopping,backend_state,437,train,webarena_verified.436 +webarena_verified.438,False,shopping,backend_state,438,train,webarena_verified.437 +webarena_verified.439,False,shopping,backend_state,439,train,webarena_verified.438 +webarena_verified.440,False,shopping,backend_state,440,test,webarena_verified.439 +webarena_verified.441,False,gitlab,backend_state,441,train,webarena_verified.422 +webarena_verified.442,False,gitlab,backend_state,442,train,webarena_verified.441 +webarena_verified.443,False,gitlab,backend_state,443,test,webarena_verified.442 +webarena_verified.444,False,gitlab,backend_state,444,train,webarena_verified.443 +webarena_verified.445,False,gitlab,backend_state,445,test,webarena_verified.444 +webarena_verified.446,False,gitlab,backend_state,446,test,webarena_verified.445 +webarena_verified.447,False,gitlab,backend_state,447,train,webarena_verified.446 +webarena_verified.448,False,gitlab,backend_state,448,test,webarena_verified.447 +webarena_verified.449,False,gitlab,backend_state,449,test,webarena_verified.448 +webarena_verified.450,False,gitlab,retrieve_value,450,train,webarena_verified.449 +webarena_verified.451,False,gitlab,retrieve_value,451,train,webarena_verified.450 +webarena_verified.452,False,gitlab,retrieve_value,452,train,webarena_verified.451 +webarena_verified.453,False,shopping_admin,backend_state,453,train,webarena_verified.423 
+webarena_verified.454,False,shopping_admin,backend_state,454,test,webarena_verified.453 +webarena_verified.455,False,shopping_admin,backend_state,455,train,webarena_verified.454 +webarena_verified.456,False,shopping_admin,backend_state,456,test,webarena_verified.455 +webarena_verified.457,False,shopping_admin,backend_state,457,train,webarena_verified.456 +webarena_verified.458,False,shopping_admin,backend_state,458,test,webarena_verified.457 +webarena_verified.459,False,shopping_admin,backend_state,459,test,webarena_verified.458 +webarena_verified.460,False,shopping_admin,backend_state,460,train,webarena_verified.459 +webarena_verified.461,False,shopping_admin,backend_state,461,train,webarena_verified.460 +webarena_verified.462,False,shopping_admin,backend_state,462,test,webarena_verified.461 +webarena_verified.463,False,shopping_admin,backend_state,463,test,webarena_verified.462 +webarena_verified.464,False,shopping_admin,backend_state,464,train,webarena_verified.463 +webarena_verified.465,False,shopping,backend_state,465,train,webarena_verified.440 +webarena_verified.466,False,shopping,backend_state,466,train,webarena_verified.465 +webarena_verified.467,False,shopping,backend_state,467,train,webarena_verified.466 +webarena_verified.468,False,shopping,backend_state,468,test,webarena_verified.467 +webarena_verified.469,False,shopping,backend_state,469,test,webarena_verified.468 +webarena_verified.470,False,shopping_admin,backend_state,470,test,webarena_verified.464 +webarena_verified.471,False,shopping_admin,backend_state,471,test,webarena_verified.470 +webarena_verified.472,False,shopping_admin,backend_state,472,train,webarena_verified.471 +webarena_verified.473,False,shopping_admin,backend_state,473,train,webarena_verified.472 +webarena_verified.474,False,shopping_admin,backend_state,474,train,webarena_verified.473 +webarena_verified.475,False,gitlab,backend_state,475,train,webarena_verified.452 
+webarena_verified.476,False,gitlab,backend_state,476,train,webarena_verified.475 +webarena_verified.477,False,gitlab,backend_state,477,train,webarena_verified.476 +webarena_verified.478,False,gitlab,backend_state,478,test,webarena_verified.477 +webarena_verified.479,False,gitlab,backend_state,479,test,webarena_verified.478 +webarena_verified.480,False,gitlab,backend_state,480,train,webarena_verified.479 +webarena_verified.481,False,gitlab,backend_state,481,train,webarena_verified.480 +webarena_verified.482,False,gitlab,backend_state,482,train,webarena_verified.481 +webarena_verified.483,False,gitlab,backend_state,483,test,webarena_verified.482 +webarena_verified.484,False,gitlab,backend_state,484,train,webarena_verified.483 +webarena_verified.485,False,gitlab,backend_state,485,test,webarena_verified.484 +webarena_verified.486,False,shopping_admin,backend_state,486,train,webarena_verified.474 +webarena_verified.487,False,shopping_admin,backend_state,487,test,webarena_verified.486 +webarena_verified.488,False,shopping_admin,backend_state,488,test,webarena_verified.487 +webarena_verified.489,False,shopping_admin,backend_state,489,train,webarena_verified.488 +webarena_verified.490,False,shopping_admin,backend_state,490,train,webarena_verified.489 +webarena_verified.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.490 +webarena_verified.492,False,shopping_admin,backend_state,492,train,webarena_verified.491 +webarena_verified.493,False,shopping_admin,backend_state,493,train,webarena_verified.492 +webarena_verified.494,False,shopping_admin,backend_state,494,train,webarena_verified.493 +webarena_verified.495,False,shopping_admin,backend_state,495,test,webarena_verified.494 +webarena_verified.496,False,shopping_admin,backend_state,496,train,webarena_verified.495 +webarena_verified.497,False,shopping_admin,backend_state,497,test,webarena_verified.496 +webarena_verified.498,False,shopping_admin,backend_state,498,test,webarena_verified.497 
+webarena_verified.499,False,shopping_admin,backend_state,499,train,webarena_verified.498 +webarena_verified.500,False,shopping_admin,backend_state,500,train,webarena_verified.499 +webarena_verified.501,False,shopping_admin,backend_state,501,train,webarena_verified.500 +webarena_verified.502,False,shopping_admin,backend_state,502,test,webarena_verified.501 +webarena_verified.503,False,shopping_admin,backend_state,503,train,webarena_verified.502 +webarena_verified.504,False,shopping_admin,backend_state,504,test,webarena_verified.503 +webarena_verified.505,False,shopping_admin,backend_state,505,train,webarena_verified.504 +webarena_verified.506,False,shopping,backend_state,506,train,webarena_verified.469 +webarena_verified.507,False,shopping,backend_state,507,train,webarena_verified.506 +webarena_verified.508,False,shopping,backend_state,508,test,webarena_verified.507 +webarena_verified.509,False,shopping,backend_state,509,test,webarena_verified.508 +webarena_verified.510,False,shopping,backend_state,510,test,webarena_verified.509 +webarena_verified.511,False,shopping,program_html,511,test,webarena_verified.510 +webarena_verified.512,False,shopping,program_html,512,train,webarena_verified.511 +webarena_verified.513,False,shopping,program_html,513,train,webarena_verified.512 +webarena_verified.514,False,shopping,program_html,514,test,webarena_verified.513 +webarena_verified.515,False,shopping,program_html,515,train,webarena_verified.514 +webarena_verified.516,False,shopping,backend_state,516,train,webarena_verified.515 +webarena_verified.517,False,shopping,backend_state,517,test,webarena_verified.516 +webarena_verified.518,False,shopping,backend_state,518,test,webarena_verified.517 +webarena_verified.519,False,shopping,backend_state,519,test,webarena_verified.518 +webarena_verified.520,False,shopping,backend_state,520,train,webarena_verified.519 +webarena_verified.521,False,shopping,backend_state,521,test,webarena_verified.520 
+webarena_verified.522,False,gitlab,backend_state,522,test,webarena_verified.485 +webarena_verified.523,False,gitlab,backend_state,523,train,webarena_verified.522 +webarena_verified.524,False,gitlab,backend_state,524,test,webarena_verified.523 +webarena_verified.525,False,gitlab,backend_state,525,train,webarena_verified.524 +webarena_verified.526,False,gitlab,backend_state,526,train,webarena_verified.525 +webarena_verified.527,False,gitlab,backend_state,527,test,webarena_verified.526 +webarena_verified.528,False,shopping,program_html,528,train,webarena_verified.521 +webarena_verified.529,False,shopping,program_html,529,test,webarena_verified.528 +webarena_verified.530,False,shopping,program_html,530,test,webarena_verified.529 +webarena_verified.531,False,shopping,program_html,531,train,webarena_verified.530 +webarena_verified.532,False,shopping,program_html,532,train,webarena_verified.531 +webarena_verified.533,False,gitlab,backend_state,533,test,webarena_verified.527 +webarena_verified.534,False,gitlab,backend_state,534,train,webarena_verified.533 +webarena_verified.535,False,gitlab,backend_state,535,test,webarena_verified.534 +webarena_verified.536,False,gitlab,backend_state,536,train,webarena_verified.535 +webarena_verified.537,False,gitlab,backend_state,537,train,webarena_verified.536 +webarena_verified.538,False,shopping_admin,backend_state,538,train,webarena_verified.505 +webarena_verified.539,False,shopping_admin,backend_state,539,train,webarena_verified.538 +webarena_verified.540,False,shopping_admin,backend_state,540,test,webarena_verified.539 +webarena_verified.541,False,shopping_admin,backend_state,541,test,webarena_verified.540 +webarena_verified.542,False,shopping_admin,backend_state,542,train,webarena_verified.541 +webarena_verified.543,False,shopping_admin,backend_state,543,test,webarena_verified.542 +webarena_verified.544,False,shopping_admin,backend_state,544,test,webarena_verified.543 
+webarena_verified.545,False,shopping_admin,backend_state,545,test,webarena_verified.544 +webarena_verified.546,False,shopping_admin,retrieve_value,546,train,webarena_verified.545 +webarena_verified.547,False,shopping_admin,backend_state,547,train,webarena_verified.546 +webarena_verified.548,False,shopping_admin,backend_state,548,train,webarena_verified.547 +webarena_verified.549,False,shopping_admin,backend_state,549,test,webarena_verified.548 +webarena_verified.550,False,shopping_admin,backend_state,550,train,webarena_verified.549 +webarena_verified.551,False,shopping_admin,backend_state,551,test,webarena_verified.550 +webarena_verified.552,False,gitlab reddit,program_html,552,test,webarena_verified.537 webarena_verified.410 +webarena_verified.553,False,gitlab reddit,program_html,553,test,webarena_verified.552 +webarena_verified.554,False,gitlab reddit,program_html,554,test,webarena_verified.553 +webarena_verified.555,False,gitlab reddit,program_html,555,test,webarena_verified.554 +webarena_verified.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.555 +webarena_verified.557,False,gitlab wikipedia,program_html,557,test,webarena_verified.556 +webarena_verified.558,False,gitlab wikipedia,program_html,558,train,webarena_verified.557 +webarena_verified.559,False,gitlab wikipedia,program_html,559,train,webarena_verified.558 +webarena_verified.560,False,gitlab wikipedia,program_html,560,test,webarena_verified.559 +webarena_verified.561,False,gitlab wikipedia,program_html,561,test,webarena_verified.560 +webarena_verified.562,False,gitlab reddit,program_html,562,train,webarena_verified.561 webarena_verified.555 +webarena_verified.563,False,gitlab reddit,program_html,563,train,webarena_verified.562 +webarena_verified.564,False,gitlab reddit,program_html,564,train,webarena_verified.563 +webarena_verified.565,False,gitlab reddit,program_html,565,test,webarena_verified.564 +webarena_verified.566,False,gitlab 
reddit,program_html,566,test,webarena_verified.565 +webarena_verified.567,False,gitlab,backend_state,567,test,webarena_verified.566 +webarena_verified.568,False,gitlab,backend_state,568,train,webarena_verified.567 +webarena_verified.569,False,gitlab,backend_state,569,train,webarena_verified.568 +webarena_verified.570,False,gitlab,backend_state,570,test,webarena_verified.569 +webarena_verified.571,False,shopping,backend_state,571,test,webarena_verified.532 +webarena_verified.572,False,shopping,backend_state,572,train,webarena_verified.571 +webarena_verified.573,False,shopping,backend_state,573,train,webarena_verified.572 +webarena_verified.574,False,shopping,backend_state,574,test,webarena_verified.573 +webarena_verified.575,False,shopping,backend_state,575,train,webarena_verified.574 +webarena_verified.576,False,gitlab,backend_state,576,test,webarena_verified.570 +webarena_verified.577,False,gitlab,backend_state,577,train,webarena_verified.576 +webarena_verified.578,False,gitlab,backend_state,578,test,webarena_verified.577 +webarena_verified.579,False,gitlab,backend_state,579,train,webarena_verified.578 +webarena_verified.580,False,reddit,backend_state,580,train,webarena_verified.566 +webarena_verified.581,False,reddit,backend_state,581,train,webarena_verified.580 +webarena_verified.582,False,reddit,backend_state,582,test,webarena_verified.581 +webarena_verified.583,False,reddit,backend_state,583,test,webarena_verified.582 +webarena_verified.584,False,reddit,backend_state,584,train,webarena_verified.583 +webarena_verified.585,False,shopping,backend_state,585,train,webarena_verified.575 +webarena_verified.586,False,shopping,backend_state,586,test,webarena_verified.585 +webarena_verified.587,False,shopping,backend_state,587,train,webarena_verified.586 +webarena_verified.588,False,shopping,backend_state,588,train,webarena_verified.587 +webarena_verified.589,False,shopping,backend_state,589,test,webarena_verified.588 
+webarena_verified.590,False,gitlab,backend_state,590,train,webarena_verified.579 +webarena_verified.591,False,gitlab,backend_state,591,test,webarena_verified.590 +webarena_verified.592,False,gitlab,backend_state,592,test,webarena_verified.591 +webarena_verified.593,False,gitlab,backend_state,593,test,webarena_verified.592 +webarena_verified.594,False,gitlab,backend_state,594,train,webarena_verified.593 +webarena_verified.595,False,reddit,backend_state,595,train,webarena_verified.584 +webarena_verified.596,False,reddit,backend_state,596,test,webarena_verified.595 +webarena_verified.597,False,reddit,backend_state,597,train,webarena_verified.596 +webarena_verified.598,False,reddit,backend_state,598,train,webarena_verified.597 +webarena_verified.599,False,reddit,backend_state,599,test,webarena_verified.598 +webarena_verified.600,False,reddit,backend_state,600,test,webarena_verified.599 +webarena_verified.601,False,reddit,backend_state,601,train,webarena_verified.600 +webarena_verified.602,False,reddit,backend_state,602,train,webarena_verified.601 +webarena_verified.603,False,reddit,backend_state,603,train,webarena_verified.602 +webarena_verified.604,False,reddit,backend_state,604,test,webarena_verified.603 +webarena_verified.605,False,reddit,backend_state,605,train,webarena_verified.604 +webarena_verified.606,False,reddit,backend_state,606,train,webarena_verified.605 +webarena_verified.607,False,reddit,backend_state,607,test,webarena_verified.606 +webarena_verified.608,False,reddit,backend_state,608,test,webarena_verified.607 +webarena_verified.609,False,reddit,backend_state,609,train,webarena_verified.608 +webarena_verified.610,False,reddit,backend_state,610,train,webarena_verified.609 +webarena_verified.611,False,reddit,backend_state,611,train,webarena_verified.610 +webarena_verified.612,False,reddit,backend_state,612,test,webarena_verified.611 +webarena_verified.613,False,reddit,backend_state,613,train,webarena_verified.612 
+webarena_verified.614,False,reddit,backend_state,614,test,webarena_verified.613 +webarena_verified.615,False,reddit,ui_state,615,test,webarena_verified.614 +webarena_verified.616,False,reddit,ui_state,616,test,webarena_verified.615 +webarena_verified.617,False,reddit,ui_state,617,train,webarena_verified.616 +webarena_verified.618,False,reddit,ui_state,618,train,webarena_verified.617 +webarena_verified.619,False,reddit,ui_state,619,train,webarena_verified.618 +webarena_verified.620,False,reddit,backend_state,620,train,webarena_verified.619 +webarena_verified.621,False,reddit,backend_state,621,train,webarena_verified.620 +webarena_verified.622,False,reddit,backend_state,622,train,webarena_verified.621 +webarena_verified.623,False,reddit,backend_state,623,test,webarena_verified.622 +webarena_verified.624,False,reddit,backend_state,624,test,webarena_verified.623 +webarena_verified.625,False,reddit,backend_state,625,train,webarena_verified.624 +webarena_verified.626,False,reddit,backend_state,626,train,webarena_verified.625 +webarena_verified.627,False,reddit,backend_state,627,train,webarena_verified.626 +webarena_verified.628,False,reddit,backend_state,628,test,webarena_verified.627 +webarena_verified.629,False,reddit,backend_state,629,test,webarena_verified.628 +webarena_verified.630,False,reddit,backend_state,630,test,webarena_verified.629 +webarena_verified.631,False,reddit,backend_state,631,train,webarena_verified.630 +webarena_verified.632,False,reddit,backend_state,632,train,webarena_verified.631 +webarena_verified.633,False,reddit,backend_state,633,test,webarena_verified.632 +webarena_verified.634,False,reddit,backend_state,634,train,webarena_verified.633 +webarena_verified.635,False,reddit,backend_state,635,train,webarena_verified.634 +webarena_verified.636,False,reddit,backend_state,636,train,webarena_verified.635 +webarena_verified.637,False,reddit,backend_state,637,train,webarena_verified.636 
+webarena_verified.638,False,reddit,ui_state,638,test,webarena_verified.637 +webarena_verified.639,False,reddit,backend_state,639,test,webarena_verified.638 +webarena_verified.640,False,reddit,backend_state,640,train,webarena_verified.639 +webarena_verified.641,False,reddit,backend_state,641,test,webarena_verified.640 +webarena_verified.642,False,reddit,backend_state,642,test,webarena_verified.641 +webarena_verified.643,False,reddit,backend_state,643,train,webarena_verified.642 +webarena_verified.644,False,reddit,backend_state,644,train,webarena_verified.643 +webarena_verified.645,False,reddit,backend_state,645,train,webarena_verified.644 +webarena_verified.646,False,reddit,backend_state,646,train,webarena_verified.645 +webarena_verified.647,False,reddit,backend_state,647,train,webarena_verified.646 +webarena_verified.648,False,reddit,backend_state,648,test,webarena_verified.647 +webarena_verified.649,False,reddit,backend_state,649,test,webarena_verified.648 +webarena_verified.650,False,reddit,backend_state,650,train,webarena_verified.649 +webarena_verified.651,False,reddit,backend_state,651,train,webarena_verified.650 +webarena_verified.652,False,reddit,backend_state,652,train,webarena_verified.651 +webarena_verified.653,False,shopping,ui_state,653,train,webarena_verified.589 +webarena_verified.654,False,shopping,ui_state,654,test,webarena_verified.653 +webarena_verified.655,False,shopping,ui_state,655,test,webarena_verified.654 +webarena_verified.656,False,shopping,ui_state,656,train,webarena_verified.655 +webarena_verified.657,False,shopping,ui_state,657,train,webarena_verified.656 +webarena_verified.658,False,gitlab,backend_state,658,train,webarena_verified.594 +webarena_verified.659,False,gitlab,backend_state,659,test,webarena_verified.658 +webarena_verified.660,False,gitlab,backend_state,660,test,webarena_verified.659 +webarena_verified.661,False,gitlab,backend_state,661,test,webarena_verified.660 
+webarena_verified.662,False,gitlab,backend_state,662,train,webarena_verified.661 +webarena_verified.663,False,gitlab,backend_state,663,train,webarena_verified.662 +webarena_verified.664,False,gitlab,backend_state,664,test,webarena_verified.663 +webarena_verified.665,False,gitlab,backend_state,665,train,webarena_verified.664 +webarena_verified.666,False,gitlab,retrieve_value,666,test,webarena_verified.665 +webarena_verified.667,False,gitlab,backend_state,667,test,webarena_verified.666 +webarena_verified.668,False,gitlab,retrieve_value,668,test,webarena_verified.667 +webarena_verified.669,False,gitlab,backend_state,669,test,webarena_verified.668 +webarena_verified.670,False,gitlab,backend_state,670,train,webarena_verified.669 +webarena_verified.671,False,shopping reddit,ui_state,671,train,webarena_verified.657 webarena_verified.652 +webarena_verified.672,False,shopping reddit,ui_state,672,train,webarena_verified.671 +webarena_verified.673,False,shopping reddit,ui_state,673,test,webarena_verified.672 +webarena_verified.674,False,shopping reddit,ui_state,674,test,webarena_verified.673 +webarena_verified.675,False,shopping reddit,ui_state,675,train,webarena_verified.674 +webarena_verified.676,False,shopping_admin,ui_state,676,test,webarena_verified.551 +webarena_verified.677,False,shopping_admin,ui_state,677,test,webarena_verified.676 +webarena_verified.678,False,shopping_admin,ui_state,678,train,webarena_verified.677 +webarena_verified.679,False,shopping_admin,ui_state,679,train,webarena_verified.678 +webarena_verified.680,False,shopping_admin,ui_state,680,train,webarena_verified.679 +webarena_verified.681,False,reddit gitlab,ui_state,681,train,webarena_verified.675 webarena_verified.670 +webarena_verified.682,False,reddit gitlab,ui_state,682,train,webarena_verified.681 +webarena_verified.683,False,reddit gitlab,ui_state,683,test,webarena_verified.682 +webarena_verified.684,False,reddit gitlab,ui_state,684,train,webarena_verified.683 
+webarena_verified.685,False,reddit gitlab,ui_state,685,train,webarena_verified.684 +webarena_verified.686,False,reddit gitlab,ui_state,686,train,webarena_verified.685 +webarena_verified.687,False,reddit gitlab,ui_state,687,test,webarena_verified.686 +webarena_verified.688,False,reddit gitlab,ui_state,688,test,webarena_verified.687 +webarena_verified.689,False,shopping,ui_state,689,test,webarena_verified.675 +webarena_verified.690,False,shopping,ui_state,690,test,webarena_verified.689 +webarena_verified.691,False,shopping,ui_state,691,train,webarena_verified.690 +webarena_verified.692,False,shopping,ui_state,692,train,webarena_verified.691 +webarena_verified.693,False,shopping,ui_state,693,train,webarena_verified.692 +webarena_verified.694,False,shopping_admin,backend_state,694,train,webarena_verified.680 +webarena_verified.695,False,shopping_admin,backend_state,695,train,webarena_verified.694 +webarena_verified.696,False,shopping_admin,backend_state,696,test,webarena_verified.695 +webarena_verified.697,False,shopping_admin,backend_state,697,train,webarena_verified.696 +webarena_verified.698,False,shopping_admin,backend_state,698,test,webarena_verified.697 +webarena_verified.699,False,shopping_admin,backend_state,699,train,webarena_verified.698 +webarena_verified.700,False,shopping_admin,backend_state,700,test,webarena_verified.699 +webarena_verified.701,False,shopping_admin,backend_state,701,test,webarena_verified.700 +webarena_verified.702,False,shopping_admin,backend_state,702,train,webarena_verified.701 +webarena_verified.703,False,shopping_admin,backend_state,703,train,webarena_verified.702 +webarena_verified.704,False,shopping_admin,ui_state,704,test,webarena_verified.703 +webarena_verified.705,False,shopping_admin,ui_state,705,test,webarena_verified.704 +webarena_verified.706,False,shopping_admin,ui_state,706,train,webarena_verified.705 +webarena_verified.707,False,shopping_admin,ui_state,707,train,webarena_verified.706 
+webarena_verified.708,False,shopping_admin,ui_state,708,train,webarena_verified.707 +webarena_verified.709,False,shopping_admin,ui_state,709,test,webarena_verified.708 +webarena_verified.710,False,shopping_admin,ui_state,710,test,webarena_verified.709 +webarena_verified.711,False,shopping_admin,ui_state,711,train,webarena_verified.710 +webarena_verified.712,False,shopping_admin,ui_state,712,train,webarena_verified.711 +webarena_verified.713,False,shopping_admin,ui_state,713,train,webarena_verified.712 +webarena_verified.714,False,reddit,backend_state,714,train,webarena_verified.688 +webarena_verified.715,False,reddit,backend_state,715,train,webarena_verified.714 +webarena_verified.716,False,reddit,backend_state,716,train,webarena_verified.715 +webarena_verified.717,False,reddit,backend_state,717,test,webarena_verified.716 +webarena_verified.718,False,reddit,backend_state,718,test,webarena_verified.717 +webarena_verified.719,False,reddit,backend_state,719,train,webarena_verified.718 +webarena_verified.720,False,reddit,backend_state,720,test,webarena_verified.719 +webarena_verified.721,False,reddit,backend_state,721,train,webarena_verified.720 +webarena_verified.722,False,reddit,backend_state,722,train,webarena_verified.721 +webarena_verified.723,False,reddit,backend_state,723,test,webarena_verified.722 +webarena_verified.724,False,reddit,backend_state,724,test,webarena_verified.723 +webarena_verified.725,False,reddit,backend_state,725,test,webarena_verified.724 +webarena_verified.726,False,reddit,backend_state,726,test,webarena_verified.725 +webarena_verified.727,False,reddit,backend_state,727,train,webarena_verified.726 +webarena_verified.728,False,reddit,backend_state,728,train,webarena_verified.727 +webarena_verified.729,False,reddit,backend_state,729,train,webarena_verified.728 +webarena_verified.730,False,reddit,backend_state,730,test,webarena_verified.729 +webarena_verified.731,False,reddit,backend_state,731,test,webarena_verified.730 
+webarena_verified.732,False,reddit,backend_state,732,train,webarena_verified.731 +webarena_verified.733,False,reddit,backend_state,733,train,webarena_verified.732 +webarena_verified.734,False,reddit,program_html,734,train,webarena_verified.733 +webarena_verified.735,False,reddit,program_html,735,test,webarena_verified.734 +webarena_verified.736,False,gitlab,backend_state,736,train,webarena_verified.688 +webarena_verified.737,False,wikipedia map,program_html,737,train,webarena_verified.430 +webarena_verified.738,False,wikipedia map,program_html,738,test,webarena_verified.737 +webarena_verified.739,False,wikipedia map,program_html,739,train,webarena_verified.738 +webarena_verified.740,False,wikipedia map,program_html,740,test,webarena_verified.739 +webarena_verified.741,False,wikipedia map,program_html,741,train,webarena_verified.740 +webarena_verified.742,False,gitlab,backend_state,742,test,webarena_verified.736 +webarena_verified.743,False,gitlab,backend_state,743,test,webarena_verified.742 +webarena_verified.744,False,gitlab,backend_state,744,test,webarena_verified.743 +webarena_verified.745,False,gitlab,backend_state,745,test,webarena_verified.744 +webarena_verified.746,False,gitlab,backend_state,746,train,webarena_verified.745 +webarena_verified.747,False,gitlab,backend_state,747,train,webarena_verified.746 +webarena_verified.748,False,gitlab,backend_state,748,train,webarena_verified.747 +webarena_verified.749,False,gitlab,backend_state,749,test,webarena_verified.748 +webarena_verified.750,False,gitlab,backend_state,750,test,webarena_verified.749 +webarena_verified.751,False,gitlab,backend_state,751,train,webarena_verified.750 +webarena_verified.752,False,gitlab,backend_state,752,train,webarena_verified.751 +webarena_verified.753,False,gitlab,backend_state,753,test,webarena_verified.752 +webarena_verified.754,False,gitlab,backend_state,754,train,webarena_verified.753 +webarena_verified.755,False,gitlab,backend_state,755,test,webarena_verified.754 
+webarena_verified.756,False,gitlab,backend_state,756,train,webarena_verified.755 +webarena_verified.757,False,map,program_html,757,test,webarena_verified.741 +webarena_verified.758,False,map,program_html,758,test,webarena_verified.757 +webarena_verified.759,False,map shopping_admin,program_html,759,test,webarena_verified.758 webarena_verified.713 +webarena_verified.760,False,map shopping_admin,program_html,760,test,webarena_verified.759 +webarena_verified.761,False,map,program_html,761,train,webarena_verified.760 +webarena_verified.762,False,map,program_html,762,train,webarena_verified.761 +webarena_verified.763,False,map,program_html,763,test,webarena_verified.762 +webarena_verified.764,False,map,program_html,764,test,webarena_verified.763 +webarena_verified.765,False,map,program_html,765,train,webarena_verified.764 +webarena_verified.766,False,map,program_html,766,train,webarena_verified.765 +webarena_verified.767,False,map,program_html,767,train,webarena_verified.766 +webarena_verified.768,False,shopping_admin,backend_state,768,test,webarena_verified.760 +webarena_verified.769,False,shopping_admin,backend_state,769,test,webarena_verified.768 +webarena_verified.770,False,shopping_admin,backend_state,770,train,webarena_verified.769 +webarena_verified.771,False,shopping_admin,backend_state,771,test,webarena_verified.770 +webarena_verified.772,False,shopping_admin,backend_state,772,test,webarena_verified.771 +webarena_verified.773,False,shopping_admin,backend_state,773,train,webarena_verified.772 +webarena_verified.774,False,shopping_admin,backend_state,774,train,webarena_verified.773 +webarena_verified.775,False,shopping_admin,backend_state,775,train,webarena_verified.774 +webarena_verified.776,False,shopping_admin,backend_state,776,test,webarena_verified.775 +webarena_verified.777,False,shopping_admin,backend_state,777,train,webarena_verified.776 +webarena_verified.778,False,shopping_admin,backend_state,778,test,webarena_verified.777 
+webarena_verified.779,False,shopping_admin,backend_state,779,train,webarena_verified.778 +webarena_verified.780,False,shopping_admin,backend_state,780,test,webarena_verified.779 +webarena_verified.781,False,shopping_admin,backend_state,781,train,webarena_verified.780 +webarena_verified.782,False,shopping_admin,backend_state,782,test,webarena_verified.781 +webarena_verified.783,False,gitlab,retrieve_value,783,train,webarena_verified.756 +webarena_verified.784,False,gitlab,retrieve_value,784,test,webarena_verified.783 +webarena_verified.785,False,gitlab,retrieve_value,785,test,webarena_verified.784 +webarena_verified.786,False,gitlab,retrieve_value,786,test,webarena_verified.785 +webarena_verified.787,False,gitlab,retrieve_value,787,test,webarena_verified.786 +webarena_verified.788,False,gitlab,retrieve_value,788,test,webarena_verified.787 +webarena_verified.789,False,gitlab,retrieve_value,789,test,webarena_verified.788 +webarena_verified.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.782 +webarena_verified.791,False,gitlab reddit,string_match,791,train,webarena_verified.789 webarena_verified.735 +webarena_verified.792,False,shopping,retrieve_value,792,test,webarena_verified.693 +webarena_verified.793,False,shopping,retrieve_value,793,train,webarena_verified.792 +webarena_verified.794,False,shopping,retrieve_value,794,test,webarena_verified.793 +webarena_verified.795,False,shopping,retrieve_value,795,train,webarena_verified.794 +webarena_verified.796,False,shopping,retrieve_value,796,train,webarena_verified.795 +webarena_verified.797,False,shopping,retrieve_value,797,test,webarena_verified.796 +webarena_verified.798,False,shopping,retrieve_value,798,train,webarena_verified.797 +webarena_verified.799,False,gitlab,backend_state,799,train,webarena_verified.791 +webarena_verified.800,False,gitlab,backend_state,800,test,webarena_verified.799 +webarena_verified.801,False,gitlab,backend_state,801,train,webarena_verified.800 
+webarena_verified.802,False,gitlab,backend_state,802,train,webarena_verified.801 +webarena_verified.803,False,gitlab,backend_state,803,test,webarena_verified.802 +webarena_verified.804,False,gitlab,backend_state,804,train,webarena_verified.803 +webarena_verified.805,False,gitlab,backend_state,805,test,webarena_verified.804 +webarena_verified.806,False,gitlab,backend_state,806,test,webarena_verified.805 +webarena_verified.807,False,gitlab,backend_state,807,train,webarena_verified.806 +webarena_verified.808,False,gitlab,backend_state,808,train,webarena_verified.807 +webarena_verified.809,False,gitlab,backend_state,809,train,webarena_verified.808 +webarena_verified.810,False,gitlab,backend_state,810,test,webarena_verified.809 +webarena_verified.811,False,gitlab,backend_state,811,test,webarena_verified.810 diff --git a/browsergym/experiments/src/browsergym/experiments/loop.py b/browsergym/experiments/src/browsergym/experiments/loop.py index 1a36e4c7..b1f9bf56 100644 --- a/browsergym/experiments/src/browsergym/experiments/loop.py +++ b/browsergym/experiments/src/browsergym/experiments/loop.py @@ -937,6 +937,7 @@ def _get_env_name(task_name: str): elif task_name.startswith("webarena"): import browsergym.webarena import browsergym.webarenalite + import browsergym.webarena_verified elif task_name.startswith("visualwebarena"): import browsergym.visualwebarena elif task_name.startswith("assistantbench"): From e8a5594dd3eb3da31a136b1afecdf6e4ad301151 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 17 Sep 2025 19:30:07 +0000 Subject: [PATCH 02/64] upd Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 52ddc156..1d5834bb 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ install: @echo "--- 🚀 Installing project dependencies ---" - pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e 
./browsergym/assistantbench -e ./browsergym/ + pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/webarenalite -e ./browsergym/webarena_verified -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/ playwright install chromium install-demo: From 8f33d1004c9b88e93babefc250aa65807e1a2ce4 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 17 Sep 2025 19:33:47 +0000 Subject: [PATCH 03/64] adding the basic files --- browsergym/webarena_verified/README.md | 117 + browsergym/webarena_verified/pyproject.toml | 41 + browsergym/webarena_verified/requirements.txt | 4 + .../browsergym/webarena_verified/__init__.py | 24 + .../browsergym/webarena_verified/config.py | 1 + .../webarena_verified/webarena_verified.json | 49835 ++++++++++++++++ 6 files changed, 50022 insertions(+) create mode 100644 browsergym/webarena_verified/README.md create mode 100644 browsergym/webarena_verified/pyproject.toml create mode 100644 browsergym/webarena_verified/requirements.txt create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/config.py create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md new file mode 100644 index 00000000..11244aac --- /dev/null +++ b/browsergym/webarena_verified/README.md @@ -0,0 +1,117 @@ +# WebArena Verified benchmark for BrowserGym + +This package provides `browsergym.webarena_verified`, which integrates the WebArena Verified benchmark from the [platform-labs-agent-eval-harness](https://github.com/ServiceNow/platform-labs-agent-eval-harness) into BrowserGym. + +## Installation + +### 0. 
Prerequisites + +Before installing this package, you need to clone the platform-labs-agent-eval-harness repository locally: + +```bash +git clone https://github.com/ServiceNow/platform-labs-agent-eval-harness.git /home/toolkit/platform-labs-agent-eval-harness +``` + +### 1. Install this BrowserGym package + +```bash +pip install browsergym-webarena-verified +``` + +This will automatically install the required dependencies from local file paths: +- `webarena-verified` from local platform-labs-agent-eval-harness repository +- `agent-eval-harness-common` from local platform-labs-agent-eval-harness repository + +**Note**: This package requires the [platform-labs-agent-eval-harness](https://github.com/ServiceNow/platform-labs-agent-eval-harness) repository to be cloned locally at `/home/toolkit/platform-labs-agent-eval-harness` before installation. + +### 2. Download required resources + +```bash +# Download NLTK tokenizer resources +python -c "import nltk; nltk.download('punkt_tab')" +``` + +## Setup + +### Environment Variables + +Set up the WebArena environment URLs. The ports should correspond to your WebArena instance setup: + +```bash +BASE_URL= # example: "http://myazuremachine.eastus.cloudapp.azure.com" + +# WebArena environment variables (change ports as needed) +export WA_SHOPPING="$BASE_URL:8082/" +export WA_SHOPPING_ADMIN="$BASE_URL:8083/admin" +export WA_REDDIT="$BASE_URL:8080" +export WA_GITLAB="$BASE_URL:9001" +export WA_WIKIPEDIA="$BASE_URL:8081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" +export WA_MAP="$BASE_URL:443" +export WA_HOMEPAGE="$BASE_URL:80" + +# Optional: Full reset functionality +export WA_FULL_RESET="$BASE_URL:7565" +``` + +### API Keys + +Set up required API keys: + +```bash +# OpenAI API key (required for LLM-based evaluations) +export OPENAI_API_KEY=... + +# Optional: Langfuse API key for tracing +export LANGFUSE_PUBLIC_KEY=... +export LANGFUSE_SECRET_KEY=... 
+``` + +## Usage + +```python +import browsergym.webarena_verified + +# The package automatically registers all WebArena Verified tasks +# as `webarena_verified.<task_id>`, with task_id from 0 to 811 (812 total tasks) + +# Example: List the registered task IDs +from browsergym.webarena_verified import ALL_WEBARENA_TASK_IDS +print(f"Available tasks: {len(ALL_WEBARENA_TASK_IDS)}") + +# Example: Create a task +from browsergym.webarena_verified.task import WebArenaVerifiedTask + +task = WebArenaVerifiedTask(seed=42, task_id=0) +``` + +## Task Configuration + +WebArena Verified tasks are configured via the `webarena_verified.json` file, which includes: + +- **Task metadata**: task_id, intent, intent_template +- **Environment setup**: sites, start_url, geolocation +- **Evaluation criteria**: expected_retrieve_value, expected_backend_state, expected_ui_state +- **Authentication**: storage_state for logged-in sessions + +## Evaluation System + +The evaluation system supports three types of validation: + +1. **Retrieve Value**: Validates that the agent successfully retrieved the expected information +2. **Backend State**: Validates that the agent made the expected changes to the backend/database +3. 
**UI State**: Validates that the agent achieved the expected UI state + +## Differences from Original WebArena + +- Enhanced evaluation with multiple validation types +- Integration with platform-labs evaluation framework +- Support for more sophisticated task validation +- Better error handling and logging +- Structured agent response format + +## Troubleshooting + +- Ensure all environment variables are set correctly +- Verify that the WebArena instance is running and accessible +- Check that all required API keys are configured +- Review logs for detailed error information diff --git a/browsergym/webarena_verified/pyproject.toml b/browsergym/webarena_verified/pyproject.toml new file mode 100644 index 00000000..10c9d726 --- /dev/null +++ b/browsergym/webarena_verified/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["hatchling", "hatch-requirements-txt"] +build-backend = "hatchling.build" + +[project] +name = "browsergym-webarena-verified" +description = "WebArena Verified benchmark for BrowserGym" +authors = [ + {name = "ServiceNow"}, +] +requires-python = ">=3.13" +license = {text = "Apache-2.0"} +classifiers = [ + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: Apache Software License", +] +dynamic = ["dependencies", "version"] + +[project.urls] +homepage = "https://github.com/ServiceNow/BrowserGym" + +[tool.hatch.version] +path = "../core/src/browsergym/core/__init__.py" + +[tool.hatch.metadata.hooks.requirements_txt] +files = ["requirements.txt"] + +[tool.hatch.build] +include = [ + "src/browsergym/webarena_verified/webarena_verified.json" +] + +[tool.hatch.metadata] +allow-direct-references = true + +[tool.hatch.build.targets.wheel] +packages = ["src/browsergym"] diff --git a/browsergym/webarena_verified/requirements.txt 
b/browsergym/webarena_verified/requirements.txt new file mode 100644 index 00000000..f95ac098 --- /dev/null +++ b/browsergym/webarena_verified/requirements.txt @@ -0,0 +1,4 @@ +browsergym-core==0.14.2 +libwebarena==0.0.4 +webarena-verified @ file:///home/toolkit/platform-labs-agent-eval-harness/benchmarks/webarena-verified +agent-eval-harness-common @ file:///home/toolkit/platform-labs-agent-eval-harness/packages/agent-eval-harness-common diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py new file mode 100644 index 00000000..fffda0ee --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py @@ -0,0 +1,24 @@ +import nltk + +from browsergym.core.registration import register_task + +from . import config, task + +# download necessary tokenizer resources +# note: deprecated punkt -> punkt_tab https://github.com/nltk/nltk/issues/3293 +try: + nltk.data.find("tokenizers/punkt_tab") +except: + nltk.download("punkt_tab", quiet=True, raise_on_error=True) + +ALL_WEBARENA_TASK_IDS = [] + +# register all WebArena benchmark +for task_id in config.TASK_IDS: + gym_id = f"webarena_verified.{task_id}" + register_task( + gym_id, + task.WebArenaVerifiedTask, + task_kwargs={"task_id": task_id}, + ) + ALL_WEBARENA_TASK_IDS.append(gym_id) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py new file mode 100644 index 00000000..b3aabb95 --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -0,0 +1 @@ +TASK_IDS = range(812) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json new file mode 100644 index 00000000..1a607487 --- /dev/null +++ 
b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @@ -0,0 +1,49835 @@ +[ + { + "sites": [ + "shopping_admin" + ], + "task_id": 0, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product name(s) in {{year}}", + "original.intent_template": "What is the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 1, + "year": 2022 + }, + "intent": "What is the top-1 best-selling product name(s) in 2022", + "original.intent": "What is the top-1 best-selling product in 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Quest Lumaflex\u2122 Band" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Quest Lumaflex\u2122 Band" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. 
Should specifically ask for product name(s)" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 1, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling brand name(s) in {{period}}", + "original.intent_template": "What is the top-{{n}} best-selling brand in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Quarter 1 2022" + }, + "intent": "What is the top-1 best-selling brand name(s) in Quarter 1 2022", + "original.intent": "What is the top-1 best-selling brand in Quarter 1 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Sprite" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Sprite" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sprite" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'brand' but agent can return the id, or the name as part of the response leading to false negatives. 
Should specifically ask for brand name(s)" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 2, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product type name(s) in {{period}}", + "original.intent_template": "What is the top-{{n}} best-selling product type in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Quarter 1 2022" + }, + "intent": "What is the top-1 best-selling product type name(s) in Quarter 1 2022", + "original.intent": "What is the top-1 best-selling product type in Quarter 1 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yoga strap" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Yoga ball" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yoga ball" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'product type' but agent can return the id, or the name as part of the response leading to false negatives. 
Should specifically ask for product type name(s)" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Yoga straps are the best selling product type in q1 2022" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 3, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product name(s) in {{year}}", + "original.intent_template": "What are the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 2, + "year": 2022 + }, + "intent": "What are the top-2 best-selling product name(s) in 2022", + "original.intent": "What are the top-2 best-selling product in 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Quest Lumaflex\u2122 Band", + "Sprite Stasis Ball 65 cm" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Quest Lumaflex\u2122 Band", + "Sprite Stasis Ball 65 cm" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band, Sprite Stasis Ball 65 cm" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. 
Should specifically ask for product name(s)" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 4, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product name(s) in {{period}}", + "original.intent_template": "What are the top-{{n}} best-selling product in {{period}}", + "instantiation_dict": { + "n": 3, + "period": "Jan 2023" + }, + "intent": "What are the top-3 best-selling product name(s) in Jan 2023", + "original.intent": "What are the top-3 best-selling product in Jan 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Impulse Duffle", + "Overnight Duffle", + "Hawkeye Yoga Short-32-Blue" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Impulse Duffle", + "Overnight Duffle", + "Hawkeye Yoga Short-32-Blue" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Impulse Duffle, Overnight Duffle, Hawkeye Yoga Short-32-Blue" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. 
Should specifically ask for product name(s)" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 5, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the top-{{n}} best-selling product type name(s) in {{period}}", + "original.intent_template": "What is the top-{{n}} best-selling product type in {{period}}", + "instantiation_dict": { + "n": 1, + "period": "Jan 2023" + }, + "intent": "What is the top-1 best-selling product type name(s) in Jan 2023", + "original.intent": "What is the top-1 best-selling product type in Jan 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Duffle" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Duffle" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Duffle" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Template asks for 'product type' but agent can return the id, or name as part of the response leading to false negatives. 
Should specifically ask for product type name(s)" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 6, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What are the top-{{n}} best-selling product name(s) in {{year}}", + "original.intent_template": "What are the top-{{n}} best-selling product in {{year}}", + "instantiation_dict": { + "n": 2, + "year": 2023 + }, + "original.instantiation_dict": { + "n": 5, + "year": 2023 + }, + "intent": "What are the top-2 best-selling product name(s) in 2023", + "original.intent": "What are the top-5 best-selling product in 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Sprite Yoga Strap 6 foot", + "Overnight Duffle" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sprite Yoga Strap 6 foot", + "Overnight Duffle", + "Ida Workout Parachute Pant-29-Purple", + "Hawkeye Yoga Short-32-Blue", + "Sprite Stasis Ball 65 cm" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sprite Yoga Strap 6 foot, Overnight Duffle, Ida Workout Parachute Pant-29-Purple, Hawkeye Yoga Short-32-Blue, Sprite Stasis Ball 65 cm" + }, + "intent_template_id": 279, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "task_ambiguity", + "note": "Last two items of the top 5 do not provide consistent results due to numerous contenders for spots 4 and 5; changed to top 2" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 7, + "require_login": true, + "storage_state": null, 
+ "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Mellon University", + "radius": "50 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" + }, + "intent_template_id": 79 + }, + { + "sites": [ + "map" + ], + "task_id": 8, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Mellon University", + "radius": "5 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "There is no airport within 5 km of Carnegie Mellon University" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "There is no airport within 5 km of Carnegie Mellon University" + }, + "intent_template_id": 79, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 9, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "international airports", + "start": "Carnegie Art Museum", + "radius": "30 km" + }, + "intent": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" + }, + "intent_template_id": 79, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact 
match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 10, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "instantiation_dict": { + "airport_type": "US international airports", + "start": "Niagara Falls", + "radius": "60 km" + }, + "intent": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States", + "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States", + "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States Buffalo-Niagara International Airport, South Youngs Road, Town of Cheektowaga, Erie County, New York, 14221, United States" + }, + "intent_template_id": 79, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", 
+ "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 11, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "disappointed" + }, + "intent": "Tell me the number of reviews that our store received so far that mention term \"disappointed\"", + "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"disappointed\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 6 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "6" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "6" + }, + "intent_template_id": 288, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 12, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "satisfied" + }, + "intent": "Tell me the number of reviews that our store received so far that mention term \"satisfied\"", + "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"satisfied\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 2 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 13, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "decent" + }, + "intent": "Tell me the number of reviews that our store received so far that mention term \"decent\"", + "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"decent\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 2 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 14, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "not useful" + }, + "intent": "Tell me the number of reviews that our store received so far that mention term \"not useful\"", + "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"not useful\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 288, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 15, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", + "instantiation_dict": { + "term": "best" + }, + "intent": "Tell me the number of reviews that our store received so far that mention term \"best\"", + "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"best\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 2 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 288, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 16, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "5000 Fifth Avenue, Pittsburgh", + "end": "UPMC family health center" + }, + "intent": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 2min", + "walking: 16min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Driving: 2min. Walking: 16min." + }, + "intent_template_id": 73, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 17, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 13min", + "walking: 1h 35min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 13min, walking: 1h 35min." 
+ }, + "intent_template_id": 73, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 18, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Univ of Pittsburgh" + }, + "intent": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 15min", + "walking: 1h 47min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 15min, walking: 1h 47min." + }, + "intent_template_id": 73, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 19, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "Carnegie Science Center", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 12min", + "walking: 1h 44min." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 12min, walking: 1h 44min." 
+ }, + "intent_template_id": 73, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 20, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Compare the difference in time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": { + "start": "Randyland", + "end": "Carnegie Mellon University" + }, + "intent": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "driving: 13min", + "walking: 1h 45min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "driving: 13min, walking: 1h 45min." + }, + "intent_template_id": 73, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 21, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "ear cups being small" + }, + "intent": "List out reviewer names, if exist, who mention about ear cups being small", + "original.intent": "List out reviewers, if exist, who mention about ear cups being small", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Joseph Brzezinski", + "Catso", + "Dibbins", + "Anglebert Dinkherhump", + "Michelle Davis" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Joseph Brzezinski", + "Catso", + "Dibbins", + "Anglebert Dinkherhump", + "Michelle Davis" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, Michelle Davis" + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 22, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "under water photo" + }, + "intent": "List out reviewer names, if exist, who mention about under water photo", + "original.intent": "List out reviewers, if exist, who mention about under water photo", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + 
"eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no review about under water photo", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 23, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "good fingerprint resistant" + }, + "intent": "List out reviewer names, if exist, who mention about good fingerprint resistant", + "original.intent": "List out reviewers, if exist, who mention about good fingerprint resistant", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Rachel", + "T. Gannon" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rachel", + "T. Gannon" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rachel, T. 
Gannon, " + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 24, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "price being unfair" + }, + "intent": "List out reviewer names, if exist, who mention about price being unfair", + "original.intent": "List out reviewers, if exist, who mention about price being unfair", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no reivew about price being unfair", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 25, + "require_login": true, + "storage_state": 
"./.auth/shopping_state.json", + "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "average print quality" + }, + "intent": "List out reviewer names, if exist, who mention about average print quality", + "original.intent": "List out reviewers, if exist, who mention about average print quality", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "GoldfishGoldfish", + "Roxanne Brandon Coffey" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Goldfish", + "Roxanne Brandon Coffey" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "GoldfishGoldfish, Roxanne Brandon Coffey" + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 26, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", + "geolocation": null, + "intent_template": "List out reviewer names, if exist, who mention about {{description}}", + 
"original.intent_template": "List out reviewers, if exist, who mention about {{description}}", + "instantiation_dict": { + "description": "complain of the customer service" + }, + "intent": "List out reviewer names, if exist, who mention about complain of the customer service", + "original.intent": "List out reviewers, if exist, who mention about complain of the customer service", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "RemyRRemyR", + "Bob in Vegas" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Bob in Vegas", + "RemyR" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Bob in Vegas, RemyRRemyR" + }, + "intent_template_id": 222, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 27, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "Showerthoughts" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + 
"ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 28, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "Worcester" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Original expected was missing a comment" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 29, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + 
"geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "DIY" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 33, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 30, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "space" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 31, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": { + "forum": "photoshopbattles" + }, + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 33, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 32, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. 
Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "information": "the walking distance", + "target1": "Hilton hotel", + "target2": "the nearest supermarket own by a local company" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "2.0km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "2.0km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 2.0km" + }, + "intent_template_id": 78, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 33, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hilton hotel", + "information": "the shortest walking distance", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. 
Then, tell me the the shortest walking distance to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "1.4km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "DoubleTree by Hilton Hotel Pittsburgh Airport", + "1.4km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 1.4km" + }, + "intent_template_id": 78, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 34, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hyatt hotel", + "information": "the shortest walking time", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. 
Then, tell me the the shortest walking time to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Hyatt Regency Pittsburgh International Airport" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport" + ], + "fuzzy_match": [ + "Time: 3h 30min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport\n3:30" + }, + "intent_template_id": 78, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 35, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "instantiation_dict": { + "place": "Pittsburgh Airport", + "target1": "Hyatt hotel", + "information": "the minimal driving time", + "target2": "a supermarket" + }, + "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. 
Then, tell me the the minimal driving time to a supermarket from the hotel.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Hyatt Regency Pittsburgh International Airport" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport" + ], + "fuzzy_match": [ + "Time: 15min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Time: 15min" + }, + "intent_template_id": 78, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 36, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "social security administration", + "location": "Carnegie Mellon University" + }, + "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "changelogs": [ + { + "key": 
"eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 37, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "police station", + "location": "gates building at CMU" + }, + "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 38, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "duquesne university", + "location": "pittsburgh airport" + }, + "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + 
"eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 39, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "walmart", + "location": "5600 fifth avenue" + }, + "intent": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 40, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": { + "place": "amc theatre", + "location": "hobart street" + }, + "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Yes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Yes" + }, + "intent_template_id": 77, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 41, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search term(s) in my store", + "original.intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "1" + }, + "intent": "List the top 1 search term(s) in my store", + "original.intent": "List the top 1 search terms in my store", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "hollister" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "hollister" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister" + }, + "intent_template_id": 285, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use proper optional pluralization" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 42, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": 
"__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search term(s) in my store", + "original.intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "2" + }, + "intent": "List the top 2 search term(s) in my store", + "original.intent": "List the top 2 search terms in my store", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "hollister", + "Joust Bag" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "hollister", + "Joust Bag" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister, Joust Bag" + }, + "intent_template_id": 285, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use proper optional pluralization" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 43, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List the top {{n}} search term(s) in my store", + "original.intent_template": "List the top {{n}} search terms in my store", + "instantiation_dict": { + "n": "3" + }, + "intent": "List the top 3 search term(s) in my store", + "original.intent": "List the top 3 search terms in my store", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "hollister", + "Joust Bag", + "Antonia Racer Tank" + ] + } + } + 
], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "hollister", + "Joust Bag", + "Antonia Racer Tank" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hollister, Joust Bag, Antonia Race Tank" + }, + "intent_template_id": 285, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use proper optional pluralization" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 44, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my todos", + "original.intent_template": "Check out my todos", + "instantiation_dict": {}, + "intent": "Navigate to my todos", + "original.intent": "Check out my todos", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/dashboard/todos" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/todos", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 303, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 45, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "Navigate to the most recent open issues", + "original.intent_template": "Check out the most recent open issues", + "instantiation_dict": {}, + "intent": "Navigate to the most recent open issues", + "original.intent": "Check out the most recent open 
issues", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 300, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 46, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Navigate to the most recent open issues", + "original.intent_template": "Check out the most recent open issues", + "instantiation_dict": {}, + "intent": "Navigate to the most recent open issues", + "original.intent": "Check out the most recent open issues", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 300, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 47, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past month" + }, + "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_count": { + "value": 0, + "type": "numeric" + }, + "amount": { + "value": 0, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0 order", + "$0 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0 order, $0 total spend" + }, + "intent_template_id": 197, + "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 48, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past three days" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_count": { + "value": 0, + "type": "numeric" + }, + "amount": { + "value": 0, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "0 order", + "$0 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0 order, $0 total spend" + }, + "intent_template_id": 197, + "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 49, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past four month" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_count": { + "value": 3, + "type": "numeric" + }, + "amount": { + "value": 845.49, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3 orders", + "$845.49 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3 orders, $845.49 total spend" + }, + "intent_template_id": 197, + "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 50, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past year" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_count": { + "value": 24, + "type": "numeric" + }, + "amount": { + "value": 6560.69, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "24 orders", + "$6560.69 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "24 orders, $6560.69 total spend" + }, + "intent_template_id": 197, + "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 51, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": { + "period": "over the past six month" + }, + "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_count": { + "value": 12, + "type": "numeric" + }, + "amount": { + "value": 1603.69, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "12 orders", + "$1603.69 total spend" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "12 orders, $1603.69 total spend" + }, + "intent_template_id": 197, + "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 52, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "starbucks on Craig Street" + }, + "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "7 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "7 min" + }, + "intent_template_id": 68, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], 
+ "task_id": 53, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Univ of Pittsburgh", + "end": "starbucks on Craig Street" + }, + "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "18 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "18 min" + }, + "intent_template_id": 68, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 54, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "Univ of Pittsburgh" + }, + "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "25 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "25 min" + }, + "intent_template_id": 68, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 55, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + 
"start": "the starbuck near CMU", + "end": "Chatham university" + }, + "intent": "How long does it take to walk from the starbuck near CMU to Chatham university?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "30 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "30 min" + }, + "intent_template_id": 68, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 56, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": { + "start": "Carnegie Museum of Art", + "end": "a library at CMU" + }, + "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "11 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "11 min" + }, + "intent_template_id": 68, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 57, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "university center at Carnegie Mellon University" + }, + "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "El Gallo de Oro", + "Back Bar Grill", + "Grano", + "Beefsteak", + "Nourish", + "Schatz Dining Room", + "Au Bon Pain" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "El Gallo de Oro", + "Back Bar Grill", + "Grano", + "Beefsteak", + "Nourish", + "Schatz Dining Room", + "Au Bon Pain" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, Au Bon Pain" + }, + "intent_template_id": 69, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 58, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "cafe", + "place2": "CMU Hunt library" + }, + "intent": "Tell me the closest cafe(s) to CMU Hunt library", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "De Fer Coffee & Tea" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "De Fer Coffee & Tea" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "De Fer Coffee & Tea" + }, + "intent_template_id": 69, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + 
}, + { + "sites": [ + "map" + ], + "task_id": 59, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Hunt library" + }, + "intent": "Tell me the closest restaurant(s) to CMU Hunt library", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "The exchange" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The exchange" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The exchange" + }, + "intent_template_id": 69, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 60, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Posner Hall" + }, + "intent": "Tell me the closest restaurant(s) to CMU Posner Hall", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "The exchange" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The exchange" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The exchange" + }, + "intent_template_id": 69, + 
"changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 61, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": { + "place1": "restaurant", + "place2": "CMU Sorrells Library" + }, + "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "La Prima Espresso" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "La Prima Espresso" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "La Prima Espresso" + }, + "intent_template_id": 69, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 62, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", + "original.intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "most" + }, + "intent": "Get customer names that have completed the most number of orders in the entire history?", + "original.intent": "Which customer has completed the most number of orders in the entire history?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": 
{ + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Jane Smith" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jane Smith" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jane Smith" + }, + "intent_template_id": 276, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to return names to avoid false negatives when agents return emails" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 63, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", + "original.intent_template": "Which customer(s) has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "second most" + }, + "intent": "Get customer names that have completed the second most number of orders in the entire history?", + "original.intent": "Which customer(s) has completed the second most number of orders in the entire history?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Adam Garcia", + "Michael Nguyen", + "Sarah Miller" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Adam Garcia", + "Michael Nguyen", + "Sarah Miller" + ] + }, + 
"reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Adam Garcia, Michael Nguyen, Sarah Miller" + }, + "intent_template_id": 276, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to return names to avoid false negatives when agents return emails" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 64, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get customer names that have placed {{number}} orders in the entire history?", + "original.intent_template": "Which customer has placed {{number}} orders in the entire history?", + "instantiation_dict": { + "number": "2" + }, + "intent": "Get customer names that have placed 2 orders in the entire history?", + "original.intent": "Which customer has placed 2 orders in the entire history?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Lisa Kim", + "Lisa Green", + "Julia Williams", + "Brian Smith", + "Alexander Thomas" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Kim", + "Lisa Green", + "Julia Williams", + "Brian Smith", + "Alexander Thomas" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Kim, Lisa Green, Julia Williams, Brian Smith, Alexander Thomas" + }, + "intent_template_id": 276, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to return names to avoid false negatives 
when agents return emails" + }, + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to return names to avoid false negatives when agents return emails" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 65, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", + "original.intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", + "instantiation_dict": { + "quantifier": "fifth most" + }, + "intent": "Get customer names that have completed the fifth most number of orders in the entire history?", + "original.intent": "Which customer has completed the fifth most number of orders in the entire history?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Jason Miller", + "Jennifer White" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jane Doe" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jane Doe" + }, + "intent_template_id": 276, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to return names to avoid false negatives when agents return emails" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Both have 5 completed orders which brings them in fifth place (1st 11 orders, 2nd 8, 3rd 7, 4th 6, 5th 5)" + }, + { + 
"key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 66, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "Provide the URLs of posts that recommend a single book" + }, + "original.instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the post URLs that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, Provide the URLs of posts that recommend a single book", + "original.intent": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", + "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", + "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "__REDDIT__/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol, __REDDIT__/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + }, + "intent_template_id": 17, + "changelogs": [ + { + 
"key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Typo in the description" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 67, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "Provide the book names from posts that recommend a single book" + }, + "original.instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the book names from posts that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, Provide the book names from posts that recommend a single book", + "original.intent": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "A Christmas Carol", + "The Hobbit" + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "A Christmas Carol", + "The Hobbit" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "A Christmas Carol, The Hobbit" + }, + "intent_template_id": 17, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Typo in the description" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 68, + "require_login": true, + 
"storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "Provide the author names and the book names from posts that recommend a single book" + }, + "original.instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "show me the author name and the book name from posts that recommand a single book" + }, + "intent": "Among the top 10 post in \"books\" forum, Provide the author names and the book names from posts that recommend a single book", + "original.intent": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "book": { + "value": "The Little Prince", + "type": "text" + }, + "author": { + "value": "Levar Burton", + "type": "text" + } + }, + { + "book": { + "value": "The Hobbit", + "type": "text" + }, + "author": { + "value": "Tolkien", + "type": "text" + } + } + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "A Christmas Carol", + "Levar Burton", + "The Hobbit", + "J. R. R. Tolkien" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "A Christmas Carol by Levar Burton: , The Hobbit by J. R. R. 
Tolkien" + }, + "intent_template_id": 17, + "format_specification": "For each pair, return a dictionary with the key 'book' for the book name and 'author' for the author name.", + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Typo in the description" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 69, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "is there any post talks about supporting local book stores? If so, tell me the names of the organizations involved" + }, + "original.instantiation_dict": { + "number": 10, + "subreddit": "books", + "description": "is there any post talks about supporting local book stores? If so, tell me the organizations involved" + }, + "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the names of the organizations involved", + "original.intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? 
If so, tell me the organizations involved", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "bookshop.org" + ] + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "bookshop.org" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "bookshop.org" + }, + "intent_template_id": 17, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Task did not indicate to return the names of the organizations" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 70, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Carnegie Mellon University" + }, + "intent": "What is the zip code of Carnegie Mellon University?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "15213" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "15213" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "15213" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 71, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + 
"place": "Chatham University" + }, + "intent": "What is the zip code of Chatham University?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "15232" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "15232" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "15232" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 72, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Yale University" + }, + "intent": "What is the zip code of Yale University?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "06516" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "06516" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "06516" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 73, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": { + "place": "Columbia University" + }, + "intent": "What is the zip code of Columbia University?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" 
+ }, + "expected_data": [ + "10027" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "10027" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "10027" + }, + "intent_template_id": 70 + }, + { + "sites": [ + "map" + ], + "task_id": 74, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Carnegie Mellon University", + "apple store shadyside", + "starbucks on craig street" + ] + }, + "intent": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" + }, + "intent_template_id": 65, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 75, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Massachusetts Institute of Technology", + "Harvard University", + "Boston Logan International Airport" + ] + }, + "intent": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" + }, + "intent_template_id": 65, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 76, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "instantiation_dict": { + "place_list": [ + "Princeton University", + "Yale University", + "Harvard University" + ] + }, + "intent": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "The order is Princeton University, Yale University, Harvard University" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The order is Princeton University, Yale University, Harvard University" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Princeton University, Yale University, Harvard University" + }, + "intent_template_id": 65, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 77, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Pending" + }, + "intent": "What is the total count of Pending reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 5 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5" + }, + "intent_template_id": 277, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + 
}, + { + "sites": [ + "shopping_admin" + ], + "task_id": 78, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Approved" + }, + "intent": "What is the total count of Approved reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 346 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "346" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "346" + }, + "intent_template_id": 277, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 79, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": { + "status": "Not Approved" + }, + "intent": "What is the total count of Not Approved reviews amongst all the reviews?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + 
"string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 277, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 80, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Carnegie Mellon University", + "place_B": "Starbucks on Craig Street", + "place_C": "Pittsburgh International Airport" + }, + "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "38 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "38 min" + }, + "intent_template_id": 72, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 81, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Univ of Pittsburgh", + "place_B": "starbucks on Craig Street", + "place_C": "Pittsburgh International Airport" + }, + "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "49 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "49 min" + }, + "intent_template_id": 72, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 82, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Massachusetts Institute of Technology", + "place_B": "Harvard University", + "place_C": "Boston Logan International Airport" + }, + "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "63 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "63 min" + }, + "intent_template_id": 72, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 83, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "instantiation_dict": { + "place_A": "Carnegie Mellon University", + "place_B": "apple store shadyside", + "place_C": "starbucks on craig street" + }, + "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then 
drive to starbucks on craig street?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "22 min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "22 min" + }, + "intent_template_id": 72, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 84, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "DoubleTree by Hilton New York Downtown", + "place": "Keens Steakhouse" + }, + "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "14 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "14 minutes" + }, + "intent_template_id": 64, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 85, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "La Quinta Inn near the airport", + "place": "Carnegie Mellon University" + }, + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?", + "require_reset": false, + "eval": { 
+ "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "30 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "30 minutes" + }, + "intent_template_id": 64, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 86, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "La Quinta Inn near the airport", + "place": "Upitt" + }, + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "29 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "29 minutes" + }, + "intent_template_id": 64, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 87, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "red roof inn", + "place": "Pittsburgh science museum" + }, + "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "20 minutes" + 
] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "20 minutes" + }, + "intent_template_id": 64, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 88, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": { + "hotel": "Homewood Suites Southpointe", + "place": "PPG Paints Arena" + }, + "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "34 minutes" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "34 minutes" + }, + "intent_template_id": 64, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 89, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Connecticut" + }, + "intent": "Which US states border Connecticut?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Rhode Island", + "Massachusetts", + "New York" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rhode Island", + "Massachusetts", + "New 
York" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York" + }, + "intent_template_id": 67, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 90, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Pennsylvania" + }, + "intent": "Which US states border Pennsylvania?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Ohio", + "Maryland", + "New York", + "New Jersey", + "Delaware", + "West Virginia" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Ohio", + "Maryland", + "New York", + "New Jersey", + "Delaware", + "West Virginia" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Ohio, Maryland, New York, New Jersey, Delaware, West Virginia" + }, + "intent_template_id": 67, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 91, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Massachusetts" + }, + "intent": "Which US states border Massachusetts?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + 
"type": "text" + }, + "expected_data": [ + "Rhode Island", + "Connecticut", + "New York", + "New Hampshire", + "Vermont" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Rhode Island", + "Connecticut", + "New York", + "New Hampshire", + "Vermont" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rhode Island, Connecticut, New York, New Hampshire, Vermont" + }, + "intent_template_id": 67, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 92, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + "state": "Vermont" + }, + "intent": "Which US states border Vermont?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "New York", + "New Hampshire", + "Massachusetts" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "New York", + "New Hampshire", + "Massachusetts" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "New York, New Hampshire, Massachusetts" + }, + "intent_template_id": 67, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 93, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": { + 
"state": "New Hampshire" + }, + "intent": "Which US states border New Hampshire?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Massachusetts", + "Vermont", + "Maine" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Massachusetts", + "Vermont", + "Maine" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Massachusetts, Vermont, Maine" + }, + "intent_template_id": 67, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 94, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the grand total of invoice {{id}}.", + "original.intent_template": "Telll me the grand total of invoice {{id}}.", + "instantiation_dict": { + "id": "000000001" + }, + "intent": "Tell me the grand total of invoice 000000001.", + "original.intent": "Telll me the grand total of invoice 000000001.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "36.39" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "36.39" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$36.39" + }, + "intent_template_id": 274, + "changelogs": [ + { + "key": "intent_template", + "category": 
"spelling_or_grammar", + "note": "Update Tell spelling" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 95, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the grand total of invoice {{id}}.", + "original.intent_template": "Telll me the grand total of invoice {{id}}.", + "instantiation_dict": { + "id": "000000002" + }, + "intent": "Tell me the grand total of invoice 000000002.", + "original.intent": "Telll me the grand total of invoice 000000002.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "39.64" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "39.64" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$39.64" + }, + "intent_template_id": 274, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Update Tell spelling" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 96, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me the status of my latest order and when will it arrive", + "instantiation_dict": {}, + "intent": "Tell me the status of my latest order and when will it arrive", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + 
"eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "status": { + "value": "canceled", + "type": "text" + }, + "arrival_date": { + "value": null, + "type": "text" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "The last order was canceled. It will never arrive." + ] + }, + "reference_url": "", + "program_html": [], + "reference_answer_raw_annotation": "The last order was canceled. It will never arrive.", + "string_note": "" + }, + "intent_template_id": 193, + "format_specification": "Use \"status\" for status and \"arrival_date\" for arrival date (null if not applicable).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map", + "wikipedia" + ], + "task_id": 97, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "instantiation_dict": {}, + "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "914km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "914km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "914 km" + }, + "intent_template_id": 120, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + 
"map" + ], + "task_id": 98, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "tea cafe", + "start": "University of Pittsburgh" + }, + "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Fuku Tea", + "3716", + "Forbes Avenue", + "Central Oakland", + "Pittsburgh", + "653m" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Fuku Tea", + "3716", + "Forbes Avenue", + "Central Oakland", + "Pittsburgh", + "653m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n653m" + }, + "intent_template_id": 66, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 99, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "Five Guys", + "start": "5700 Penn Ave" + }, + "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + 
"ordered": false, + "type": "text" + }, + "expected_data": [ + "Five Guys", + "117", + "South Bouquet Street", + "North Oakland", + "Pittsburgh", + "4.0km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Five Guys", + "117", + "South Bouquet Street", + "North Oakland", + "Pittsburgh", + "4.0km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n4.0km" + }, + "intent_template_id": 66, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 100, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "Starbucks", + "start": "Carnegie Mellon" + }, + "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Starbucks", + "417", + "South Craig Street", + "Bellefield", + "Pittsburgh", + "557m" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Starbucks", + "417", + "South Craig Street", + "Bellefield", + "Pittsburgh", + "557m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 
15213, United States\n557m" + }, + "intent_template_id": 66, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 101, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": { + "places": "In-N-Out", + "start": "Upitts" + }, + "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no In-N-Out near University of Pittsburgh", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 66, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 102, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "help wanted", + "repo": "a11yproject/a11yproject.com" + }, + "original.instantiation_dict": { + "label": "help needed", + "repo": "a11yproject/a11yproject.com" + }, + "intent": "Navigate to and display the list of issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", + "original.intent": "Display the list of issues in the 
a11yproject/a11yproject.com repository that have labels related to help needed", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "instantiation_dict", + "category": "reference_alignment", + "note": "Use available label 'help wanted'; correct org/repo" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 103, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "question", + "repo": "kkroening/ffmpeg-python" + }, + "original.instantiation_dict": { + "label": "questions", + "repo": "kkroening/ffmpeg-python" + }, + "intent": "Navigate to and display the list of issues in the kkroening/ffmpeg-python repository that have labels related to question", + "original.intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + 
"reference_answers": null, + "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Use singular 'question' label" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 104, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "flaky-test", + "repo": "keycloak/keycloak" + }, + "intent": "Navigate to and display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", + "original.intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 105, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", 
+ "geolocation": null, + "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "OpenAPI Generator CLI", + "repo": "OpenAPITools/openapi-generator" + }, + "intent": "Navigate to and display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", + "original.intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 106, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "BUG", + "repo": "umano/AndroidSlidingUpPanel" + }, + "intent": "Navigate to and display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to 
BUG", + "original.intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 349, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 107, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the monthly count of successful orders {{period}}", + "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from May to December 2022" + }, + "intent": "Get the monthly count of successful orders from May to December 2022", + "original.intent": "Presents the monthly count of successful orders from May to December 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "May", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "June", + "type": "month" + }, + "count": { + "value": 13, + "type": "numeric" + } + }, + { + "month": { + "value": "July", + "type": "month" + }, + "count": { + "value": 9, + "type": "numeric" + } + }, + { + "month": { + "value": "August", + 
"type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "September", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + }, + { + "month": { + "value": "October", + "type": "month" + }, + "count": { + "value": 4, + "type": "numeric" + } + }, + { + "month": { + "value": "November", + "type": "month" + }, + "count": { + "value": 5, + "type": "numeric" + } + }, + { + "month": { + "value": "December", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "October: 4 orders", + "November: 5 orders", + "December: 10 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " + }, + "intent_template_id": 270, + "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "'Presents' does not match the evaluation that checks for a return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 108, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the monthly count of successful orders {{period}}", + "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "01/2023-05/2023" + }, 
+ "intent": "Get the monthly count of successful orders 01/2023-05/2023", + "original.intent": "Presents the monthly count of successful orders 01/2023-05/2023 in MM:COUNT format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "January", + "type": "month" + }, + "count": { + "value": 12, + "type": "numeric" + } + }, + { + "month": { + "value": "February", + "type": "month" + }, + "count": { + "value": 7, + "type": "numeric" + } + }, + { + "month": { + "value": "March", + "type": "month" + }, + "count": { + "value": 5, + "type": "numeric" + } + }, + { + "month": { + "value": "April", + "type": "month" + }, + "count": { + "value": 9, + "type": "numeric" + } + }, + { + "month": { + "value": "May", + "type": "month" + }, + "count": { + "value": 5, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 12 orders", + "Feburary: 7 orders", + "March: 5 orders", + "April: 9 orders", + "May: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 12 orders Febulary: 7 orders March: 5 orders Apirl: 9 orders May: 5 orders" + }, + "intent_template_id": 270, + "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "'Presents' does not match the evaluation that checks for a return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 109, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + 
"start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the monthly count of successful orders {{period}}", + "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Jan to December 2022" + }, + "intent": "Get the monthly count of successful orders from Jan to December 2022", + "original.intent": "Presents the monthly count of successful orders from Jan to December 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "January", + "type": "month" + }, + "count": { + "value": 11, + "type": "numeric" + } + }, + { + "month": { + "value": "February", + "type": "month" + }, + "count": { + "value": 16, + "type": "numeric" + } + }, + { + "month": { + "value": "March", + "type": "month" + }, + "count": { + "value": 14, + "type": "numeric" + } + }, + { + "month": { + "value": "April", + "type": "month" + }, + "count": { + "value": 7, + "type": "numeric" + } + }, + { + "month": { + "value": "May", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "June", + "type": "month" + }, + "count": { + "value": 13, + "type": "numeric" + } + }, + { + "month": { + "value": "July", + "type": "month" + }, + "count": { + "value": 9, + "type": "numeric" + } + }, + { + "month": { + "value": "August", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "September", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + }, + { + "month": { + "value": "October", + "type": "month" + }, + "count": { + "value": 4, + "type": "numeric" + } + }, + { + "month": { + "value": "November", + "type": "month" + }, + "count": { + "value": 
5, + "type": "numeric" + } + }, + { + "month": { + "value": "December", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 11 orders", + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders", + "December: 10 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " + }, + "intent_template_id": 270, + "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "'Presents' does not match the evaluation that checks for a return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 110, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the monthly count of successful orders {{period}}", + "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Jan to Nov 2022" + }, + "intent": "Get the monthly count of successful orders from Jan to Nov 2022", + "original.intent": "Presents the monthly count of successful orders from Jan to Nov 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "January", + "type": "month" + }, + "count": { + "value": 11, + "type": "numeric" + } + }, + { + "month": { + "value": "February", + "type": "month" + }, + "count": { + "value": 16, + "type": "numeric" + } + }, + { + "month": { + "value": "March", + "type": "month" + }, + "count": { + "value": 14, + "type": "numeric" + } + }, + { + "month": { + "value": "April", + "type": "month" + }, + "count": { + "value": 7, + "type": "numeric" + } + }, + { + "month": { + "value": "May", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "June", + "type": "month" + }, + "count": { + "value": 13, + "type": "numeric" + } + }, + { + "month": { + "value": "July", + "type": "month" + }, + "count": { + "value": 9, + "type": "numeric" + } + }, + { + "month": { + "value": "August", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "September", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + }, + { + "month": { + "value": "October", + "type": "month" + }, + "count": { + "value": 4, + "type": "numeric" + } + }, + { + "month": { + "value": "November", + "type": "month" + }, + "count": { + "value": 5, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January: 11 orders", + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January: 11 
orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders " + }, + "intent_template_id": 270, + "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "'Presents' does not match the evaluation that checks for a return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 111, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the monthly count of successful orders {{period}}", + "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", + "instantiation_dict": { + "period": "from Feb to Nov 2022" + }, + "intent": "Get the monthly count of successful orders from Feb to Nov 2022", + "original.intent": "Presents the monthly count of successful orders from Feb to Nov 2022 in MM:COUNT format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "February", + "type": "month" + }, + "count": { + "value": 16, + "type": "numeric" + } + }, + { + "month": { + "value": "March", + "type": "month" + }, + "count": { + "value": 14, + "type": "numeric" + } + }, + { + "month": { + "value": "April", + "type": "month" + }, + "count": { + "value": 7, + "type": "numeric" + } + }, + { + "month": { + "value": "May", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "June", + "type": "month" + }, + "count": { + 
"value": 13, + "type": "numeric" + } + }, + { + "month": { + "value": "July", + "type": "month" + }, + "count": { + "value": 9, + "type": "numeric" + } + }, + { + "month": { + "value": "August", + "type": "month" + }, + "count": { + "value": 8, + "type": "numeric" + } + }, + { + "month": { + "value": "September", + "type": "month" + }, + "count": { + "value": 10, + "type": "numeric" + } + }, + { + "month": { + "value": "October", + "type": "month" + }, + "count": { + "value": 4, + "type": "numeric" + } + }, + { + "month": { + "value": "November", + "type": "month" + }, + "count": { + "value": 5, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Feburary: 16 orders", + "March: 14 orders", + "April: 7 orders", + "May: 8 orders", + "June: 13 orders", + "July: 9 orders", + "August: 8 orders", + "Sepetember: 10 orders", + "Octorbor: 4 orders", + "November: 5 orders" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders " + }, + "intent_template_id": 270, + "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "'Presents' does not match the evaluation that checks for a return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 112, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Return the customer names for those who gave a rating of 3 stars or below 
for {{product}}?", + "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Circe fleece" + }, + "intent": "Return the customer names for those who gave a rating of 3 stars or below for Circe fleece?", + "original.intent": "Show me the customers who have expressed dissatisfaction with Circe fleece?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Hannah Lim" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Hannah Lim" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hannah Lim" + }, + "intent_template_id": 245, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 113, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", + "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Olivia zip jacket" + }, + "intent": "Return the customer names for those who gave a rating of 3 stars or below for Olivia zip jacket?", + "original.intent": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, 
+ "type": "text" + }, + "expected_data": [ + "Emma Lopez", + "Seam Miller" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Emma Lopez", + "Seam Miller" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Emma Lopez, Seam Miller" + }, + "intent_template_id": 245, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 114, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", + "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "Antonia racer tank" + }, + "intent": "Return the customer names for those who gave a rating of 3 stars or below for Antonia racer tank?", + "original.intent": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Shaunte", + "Merrie" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Shaunte", + "Merrie" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shaunte, Merrie" + }, + "intent_template_id": 
245, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 115, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", + "original.intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}", + "instantiation_dict": { + "product": "Chloe tank" + }, + "intent": "Return the customer names for those who gave a rating of 3 stars or below for Chloe tank?", + "original.intent": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no negative review for Chloe tank", + "reference_answer_raw_annotation": "" + }, + "intent_template_id": 245, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 116, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + 
"geolocation": null, + "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", + "original.intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}?", + "instantiation_dict": { + "product": "tanks products" + }, + "intent": "Return the customer names for those who gave a rating of 3 stars or below for tanks products?", + "original.intent": "Show me the name of the customers who have expressed dissatisfaction with tanks products?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Alexander", + "Carma", + "Dominic", + "Merrie", + "Monroe", + "Scotty", + "Shaunte", + "Teofila", + "Valorie", + "Yan" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Alexander", + "Carma", + "Dominic", + "Merrie", + "Monroe", + "Scotty", + "Shaunte", + "Teofila", + "Valorie" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, Valorie" + }, + "intent_template_id": 245, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Missing expected reviewer name" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 117, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return the date when 
I made my first purchase on this site?", + "original.intent_template": "What is the date when I made my first purchase on this site?", + "instantiation_dict": {}, + "intent": "Return the date when I made my first purchase on this site?", + "original.intent": "What is the date when I made my first purchase on this site?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "3/2/22" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3/2/22" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3/2/22" + }, + "intent_template_id": 161, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 118, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "instantiation_dict": {}, + "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "", + "required_contents": { + "must_include": [ + "jaw bruxism", + "mouth guard" + ] + } + } + ] + }, + "intent_template_id": 151 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 119, + "require_login": true, + 
"storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "original.intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "List all reviews with 4 stars or above for Antonia Racer Tank.", + "original.intent": "Tell me the reasons why customers like Antonia Racer Tank", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "A regular or me", + "type": "text" + }, + "rating": { + "value": 4, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Its color and style is good" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Its color and style is good" + }, + "intent_template_id": 250, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 120, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "original.intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Ana Running Short" + }, + "intent": "List all reviews with 4 stars or above for Ana Running Short.", + "original.intent": "Tell me the reasons 
why customers like Ana Running Short", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "It was really hard to find the right siz", + "type": "text" + }, + "rating": { + "value": 4, + "type": "numeric" + } + }, + { + "title": { + "value": "VERY LIGHTWEIGHT COMFY-GOOD SHOES", + "type": "text" + }, + "rating": { + "value": 5, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "It is comfortable" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "It is comfortable" + }, + "intent_template_id": 250, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 121, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "original.intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Circe hooded fleece" + }, + "intent": "List all reviews with 4 stars or above for Circe hooded fleece.", + "original.intent": "Tell me the reasons why customers like Circe hooded fleece", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Good but not perfect", 
+ "type": "text" + }, + "rating": { + "value": 4, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Warm and comfortable. True to size." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Warm and comfortable. True to size." + }, + "intent_template_id": 250, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 122, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "original.intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Olivia zip jacket" + }, + "intent": "List all reviews with 4 stars or above for Olivia zip jacket.", + "original.intent": "Tell me the reasons why customers like Olivia zip jacket", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Quite good", + "type": "text" + }, + "rating": { + "value": 5, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Lightweight, comfortable and stylish. Good design and details." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lightweight, comfortable, and stylish. 
Good design and details." + }, + "intent_template_id": 250, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 123, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "original.intent_template": "Tell me the reasons why customers like {{product}}", + "instantiation_dict": { + "product": "Circe's products" + }, + "intent": "List all reviews with 4 stars or above for Circe's products.", + "original.intent": "Tell me the reasons why customers like Circe's products", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Good but not perfect", + "type": "text" + }, + "rating": { + "value": 4, + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Warm and comfortable. True to size." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Warm and comfortable. True to size." 
+ }, + "intent_template_id": 250, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 124, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "wireless earphone" + }, + "intent": "What is the price range of wireless earphone in the One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 0.01, + "type": "currency" + }, + "max": { + "value": 745.0, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0.14", + "745.00" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$0.14 - $745.00" + }, + "intent_template_id": 159, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Min cost for earphone is a earbud for 0.01" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 125, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + 
"instantiation_dict": { + "product": "teeth grinding mouth guard" + }, + "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 1.46, + "type": "currency" + }, + "max": { + "value": 85.0, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1.46", + "85" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$1.46 - $85" + }, + "intent_template_id": 159, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 126, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": { + "product": "Canon photo printer" + }, + "intent": "What is the price range of Canon photo printer in the One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 2.56, + "type": "currency" + }, + "max": { + "value": 649.99, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2.56", + "649.99" + ] + }, + 
"reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$2.56 - $649.99" + }, + "intent_template_id": 159, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 127, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What brands appear most frequently among the top search terms?", + "instantiation_dict": {}, + "intent": "What brands appear most frequently among the top search terms?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Hollister", + "Joust", + "Antonia" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hollister", + "Joust", + "Antonia" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hollister, Joust, Antonia" + }, + "intent_template_id": 1001, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 128, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "2" + }, + "intent": "What's the total number of items sold in the most recent 2 orders?", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 9 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9" + }, + "intent_template_id": 1002, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 129, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "4" + }, + "intent": "What's the total number of items sold in the most recent 4 orders?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 16 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16" + }, + "intent_template_id": 1002, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 130, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent 
{{k}} orders?", + "instantiation_dict": { + "k": "5" + }, + "intent": "What's the total number of items sold in the most recent 5 orders?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 18 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "18" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "18" + }, + "intent_template_id": 1002, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 131, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", + "instantiation_dict": { + "k": "7" + }, + "intent": "What's the total number of items sold in the most recent 7 orders?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 25 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "25" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "25" + }, + "intent_template_id": 1002, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 132, + "require_login": true, + "storage_state": 
"./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "repo": "a11yproject.com", + "date": "3/5/2023" + }, + "original.instantiation_dict": { + "user": "kilian", + "repo": "a11yproject", + "date": "3/5/2023" + }, + "intent": "How many commits did kilian make to a11yproject.com on 3/5/2023?", + "original.intent": "How many commits did kilian make to a11yproject on 3/5/2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 322, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Repo corrected; date format ambiguous; require numeric answer" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 133, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Eric", + "repo": "a11yproject.com", + "date": "3/2" + }, + "original.instantiation_dict": { + "user": "Eric", + "repo": "a11yproject", + "date": "3/2" + }, + "intent": "How many commits did Eric make to 
a11yproject.com on 3/2?", + "original.intent": "How many commits did Eric make to a11yproject on 3/2?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 2 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 322, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Same as 132: repo corrected; ambiguous date; numeric answer" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 134, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "repo": "a11yproject.com", + "date": "3/1/2023" + }, + "original.instantiation_dict": { + "user": "kilian", + "repo": "a11yproject", + "date": "3/1/2023" + }, + "intent": "How many commits did kilian make to a11yproject.com on 3/1/2023?", + "original.intent": "How many commits did kilian make to a11yproject on 3/1/2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 322, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Same as 132: repo corrected; ambiguous date; numeric answer" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 135, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Eric and Kilian", + "repo": "a11yproject.com", + "date": "1/3/2023" + }, + "original.instantiation_dict": { + "user": "Eric and Kilian", + "repo": "a11yproject", + "date": "1/3/2023" + }, + "intent": "How many commits did Eric and Kilian make to a11yproject.com on 1/3/2023?", + "original.intent": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 322, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Repo corrected; date format ambiguous; require numeric 
answer" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 136, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": { + "user": "Steven Woodson", + "repo": "a11y-webring.club", + "date": "2/6/2023" + }, + "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 5 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5" + }, + "intent_template_id": 322, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 137, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the city where the Liberty Bell is located", + "city2": "the home city of Pirates" + }, + "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the 
home city of Pirates?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "5h 47min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5h 47min" + }, + "intent_template_id": 51, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 138, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the big apple", + "city2": "the city with the most authentic Philly cheesesteaks" + }, + "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "1h 58min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1h 58min" + }, + "intent_template_id": 51, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 139, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the hometown of Joe Biden", + "city2": "Bridgeport" + }, + "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": [ + "3h 20min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3h 20min" + }, + "intent_template_id": 51, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 140, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": { + "city1": "the city of Niagara Falls", + "city2": "the city of Yale University" + }, + "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "8h 33min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "8h 33min" + }, + "intent_template_id": 51, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 141, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", + "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "food-related", + "time": "March 2023" + }, + "intent": "Return how much I spent on food-related shopping during March 2023", + "original.intent": "How much I spent on food-related shopping during March 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": 
"verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "47.41" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "47.41" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$47.41" + }, + "intent_template_id": 162, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 142, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", + "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "hair care and hair style", + "time": "Jan 2023" + }, + "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023", + "original.intent": "How much I spent on hair care and hair style shopping during Jan 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "68.51" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "95.23" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$95.23" + }, + "intent_template_id": 162, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + 
"note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Update expected value to match two products in jan orders (50.52 + 17.99)" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 143, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", + "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "home decoration", + "time": "1/29/2023" + }, + "intent": "Return how much I spent on home decoration shopping during 1/29/2023", + "original.intent": "How much I spent on home decoration shopping during 1/29/2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "260.69" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "265.69" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$265.69" + }, + "intent_template_id": 162, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Update expected value to tree purchase 260.69" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 144, + 
"require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", + "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "food", + "time": "from mid Jan to the end Jan 2023" + }, + "intent": "Return how much I spent on food shopping during from mid Jan to the end Jan 2023", + "original.intent": "How much I spent on food shopping during from mid Jan to the end Jan 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 162, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 145, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", + "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", + "instantiation_dict": { + "category": "cooking and food", + "time": "March 2022" + }, + "intent": "Return how much I spent on cooking and food shopping during March 2022", + "original.intent": "How much I spent on cooking and food 
shopping during March 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "52.35" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "52.35" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$52.35" + }, + "intent_template_id": 162, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 146, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", + "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "Sep 2022" + }, + "intent": "What is the size of the picture frame I bought Sep 2022", + "original.intent": "What is the size configuration of the picture frame I bought Sep 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "16\"x24\"" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16x24" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": 
"16x24" + }, + "intent_template_id": 155, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent by removing \"configuration\"" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Update to correct size" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 147, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", + "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "2022" + }, + "intent": "What is the size of the picture frame I bought between June and December 2022", + "original.intent": "What is the size configuration of the picture frame I bought 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "16\"x24\"" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16x24" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16x24" + }, + "intent_template_id": 155, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent by removing \"configuration\"" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Update to correct size" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + 
"sites": [ + "shopping" + ], + "task_id": 148, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", + "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "color", + "product": "picture frame", + "time": "Sep 2022" + }, + "intent": "What is the color of the picture frame I bought Sep 2022", + "original.intent": "What is the color configuration of the picture frame I bought Sep 2022", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Mist" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Mist" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Mist" + }, + "intent_template_id": 155, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent by removing \"configuration\"" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 149, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", + "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "color", + "product": "artifical plants", + "time": "Feb 2023" + }, + "intent": "What is the color of the artifical plants I bought Feb 2023", + 
"original.intent": "What is the color configuration of the artifical plants I bought Feb 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Green" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Green-vines" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Green-vines" + }, + "intent_template_id": 155, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent by removing \"configuration\"" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Update to the exact color" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 150, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", + "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "instantiation_dict": { + "option": "price", + "product": "fake tree", + "time": "Jan 2023" + }, + "intent": "What is the price of the fake tree I bought Jan 2023", + "original.intent": "What is the price configuration of the fake tree I bought Jan 2023", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "260.69" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "must_include": [ + "260.69" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "260.69" + }, + "intent_template_id": 155, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent by removing \"configuration\"" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 151, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "CMU", + "location2": "University of Pittsburgh" + }, + "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 152, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "Schenley park", + "location2": "Upitt" + }, + "intent": "What is the minimum travel time by car from Schenley park to Upitt?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], 
+ "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 153, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "REI", + "location2": "CMU" + }, + "intent": "What is the minimum travel time by car from REI to CMU?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "7min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "7min" + }, + "intent_template_id": 36, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 154, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "CMU gates building", + "location2": "Schenley park" + }, + "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, + "intent_template_id": 36, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + 
], + "task_id": 155, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": { + "location1": "Animal Rescue League of Pittsburgh", + "location2": "Schenley park" + }, + "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "9min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9min" + }, + "intent_template_id": 36, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 156, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to the merge requests assigned to me", + "original.intent_template": "Checkout merge requests assigned to me", + "instantiation_dict": {}, + "intent": "Navigate to the merge requests assigned to me", + "original.intent": "Checkout merge requests assigned to me", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 290, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], 
+ "task_id": 157, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to all customers", + "original.intent_template": "Show all customers", + "instantiation_dict": {}, + "intent": "Navigate to all customers", + "original.intent": "Show all customers", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/customer/index/" + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/customer/index/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 255, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the customers page instead of returning a list of customers" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 158, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 11 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 11 cards", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 159, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 31 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 31 cards", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 160, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 6 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 6 cards", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": 
"__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 161, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 23 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 23 cards", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 162, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": 
"__SHOPPING__", + "geolocation": null, + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": { + "num": 40 + }, + "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 171 + }, + { + "sites": [ + "shopping" + ], + "task_id": 163, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html", + "geolocation": null, + "intent_template": "List all review titles with 2 stars or below for this product.", + "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "List all review titles with 2 stars or below for this product.", + "original.intent": "What are the main criticisms of this product? 
Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Memory Card Came Defective" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "I ordered the 16gb but I only got 14 gigs even though I formatted the card", + "The memory card is kind of slow on games and downloads", + "No original packaging It's used and the previous owners data has not been erased", + "The product is a legit sony hardware that have been owned by someone else before", + "The media could not be loaded", + "I could not format the card so I wasn\u2019t able to use it for my VITA" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA" + }, + "intent_template_id": 136, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." 
+ } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 164, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html", + "geolocation": null, + "intent_template": "List all review titles with 2 stars or below for this product.", + "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "List all review titles with 2 stars or below for this product.", + "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Meh" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Dry", + "Uneven color" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "DryUneven color" + }, + "intent_template_id": 136, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 165, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html", + "geolocation": null, + "intent_template": "List all review titles with 2 stars or below for this product.", + "original.intent_template": "What are the main criticisms of this product? 
Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "List all review titles with 2 stars or below for this product.", + "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "So cute but too small", + "Toe rubbed" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The 39 was too small. I am afraid the 40 will be too big", + "I was very sad when the shoe rubbed up against my baby toe", + "I had to return them because I knew in time it would tear up my feet", + "The problem is that the strap is made of some really stiff leather and is painful to my heel", + "The front is also uncomfortably tight", + "The Dansko's were similar (not as bad) and loosened up over time" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The 39 was too small. I am afraid the 40 will be too big. I was very sad when the shoe rubbed up against my baby toe. I had to return them because I knew in time it would tear up my feet. The problem is that the strap is made of some really stiff leather and is painful to my heel. The front is also uncomfortably tight. The Dansko's were similar (not as bad) and loosened up over time." + }, + "intent_template_id": 136, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." 
+ } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 166, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html", + "geolocation": null, + "intent_template": "List all review titles with 2 stars or below for this product.", + "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "List all review titles with 2 stars or below for this product.", + "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no existing criticism", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 136, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." 
+ }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 167, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html", + "geolocation": null, + "intent_template": "List all review titles with 2 stars or below for this product.", + "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", + "instantiation_dict": {}, + "intent": "List all review titles with 2 stars or below for this product.", + "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Waste of big money" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "The wireless connection works on a whim (about 40% of the time I've owned it)", + "It seems to constantly run out of ink", + "Cartridge prices are less than some printers I've had", + "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason", + "Scanner is as slow as my first scanner I ever owned in the mid-90's", + "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." 
+ ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had, but now I understand why. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." + }, + "intent_template_id": 136, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." 
+ } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 168, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "more than 100" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "No repo found", + "reference_answer_raw_annotation": "No repo found" + }, + "intent_template_id": 289, + "format_specification": "A list of the repository names (do not include the group/organization name)", + "changelogs": [ + { + "key": "format_specification", + "category": "clarify_instructions", + "note": "Clarify the format of the response" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Expect the right error code for unachievable tasks" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 169, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "the most" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got the most stars?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": 
"verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "a11yproject.com", + "design" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "a11yproject.com", + "design" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "a11yproject.com, Primer/design" + }, + "intent_template_id": 289, + "format_specification": "A list of the repository names (do not include the group/organization name)", + "changelogs": [ + { + "key": "format_specification", + "category": "clarify_instructions", + "note": "Clarify the format of the response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 170, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "the least" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got the least stars?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { 
+ "must_include": [ + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289, + "format_specification": "A list of the repository names (do not include the group/organization name)", + "changelogs": [ + { + "key": "format_specification", + "category": "clarify_instructions", + "note": "Clarify the format of the response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 171, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "less than 5" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "a11y-syntax-highlighting", + "a11y-webring.club", + "accessible-html-content-patterns", + "ericwbailey.website", + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + } + } + ], + "site": "gitlab" + }, + 
"original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "a11y-syntax-highlighting", + "a11y-webring.club", + "accessible-html-content-patterns", + "ericwbailey.website", + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289, + "format_specification": "A list of the repository names (do not include the group/organization name)", + "changelogs": [ + { + "key": "format_specification", + "category": "clarify_instructions", + "note": "Clarify the format of the response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 172, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", + "instantiation_dict": { + "description": "no" + }, + "intent": "Tell me the full names of the repositories where I made contributions and they got no stars?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "cloud-to-butt", + "dotfiles", + "timeit", + 
"solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "cloud-to-butt", + "dotfiles", + "timeit", + "solarized-prism-theme", + "gimmiethat.space", + "remove-board-movement-events-from-the-github-issue-timeline" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" + }, + "intent_template_id": 289, + "format_specification": "A list of the repository names (do not include the group/organization name)", + "changelogs": [ + { + "key": "format_specification", + "category": "clarify_instructions", + "note": "Clarify the format of the response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 173, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "better" + }, + "intent": "Navigate to my latest updated issue that has keyword \"better\" in its title to check if it is closed", + "original.intent": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + 
{ + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Not closed" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": [ + "No, it is open" + ] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "", + "url_note": "GOLD in PRED" + }, + "intent_template_id": 310, + "format_specification": "Respond with \"Not closed\" or \"Closed\"", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 174, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "feature" + }, + "intent": "Navigate to my latest updated issue that has keyword \"feature\" in its title to check if it is closed", + "original.intent": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + 
"expected_data": [ + "Not closed" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": [ + "No, it is open" + ] + }, + "reference_url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310, + "format_specification": "Respond with \"Not closed\" or \"Closed\"", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 175, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "dependency" + }, + "intent": "Navigate to my latest updated issue that has keyword \"dependency\" in its title to check if it is closed", + "original.intent": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Not closed" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + 
"reference_answers": { + "fuzzy_match": [ + "No, it is open" + ] + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310, + "format_specification": "Respond with \"Not closed\" or \"Closed\"", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 176, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "theme editor" + }, + "intent": "Navigate to my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed", + "original.intent": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Not closed" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": [ + "No, it is open" + ] + }, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", + 
"program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 310, + "format_specification": "Respond with \"Not closed\" or \"Closed\"", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 177, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", + "instantiation_dict": { + "keyword": "homepage content" + }, + "intent": "Navigate to my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed", + "original.intent": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Closed" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Yes, it is closed" + ] + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", + "program_html": [], + "reference_answer_raw_annotation": "closed", + "string_note": "" + }, + "intent_template_id": 310, + "format_specification": 
"Respond with \"Not closed\" or \"Closed\"", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 178, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", + "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "better" + }, + "intent": "Navigate to and open my latest created issue that has better in its title to check if it is closed", + "original.intent": "Open my latest created issue that has better in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "Yes" + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/566", + "program_html": [], + "reference_answer_raw_annotation": "Closed", + "string_note": "" + }, + "intent_template_id": 500, + "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify that this task requires navigation and a response" + }, + { + "key": 
"expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 179, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", + "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "feature" + }, + "intent": "Navigate to and open my latest created issue that has feature in its title to check if it is closed", + "original.intent": "Open my latest created issue that has feature in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "Yes" + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1517", + "program_html": [], + "reference_answer_raw_annotation": "Closed", + "string_note": "" + }, + "intent_template_id": 500, + "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify that this task requires navigation and a response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 180, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": 
null, + "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", + "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "dependency" + }, + "intent": "Navigate to and open my latest created issue that has dependency in its title to check if it is closed", + "original.intent": "Open my latest created issue that has dependency in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "No" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "No" + }, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 500, + "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify that this task requires navigation and a response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 181, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", + "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "theme 
editor" + }, + "intent": "Navigate to and open my latest created issue that has theme editor in its title to check if it is closed", + "original.intent": "Open my latest created issue that has theme editor in its title to check if it is closed", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "No" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "No" + }, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", + "program_html": [], + "reference_answer_raw_annotation": "Not closed", + "string_note": "" + }, + "intent_template_id": 500, + "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify that this task requires navigation and a response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 182, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", + "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", + "instantiation_dict": { + "keyword": "homepage content" + }, + "intent": "Navigate to and open my latest created issue that has homepage content in its title to check if it is closed", + "original.intent": "Open my latest created issue that has homepage content in its title to check if it is closed", + "require_reset": false, + 
"eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Yes" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match", + "url_match" + ], + "reference_answers": { + "exact_match": "Yes" + }, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", + "program_html": [], + "reference_answer_raw_annotation": "closed", + "string_note": "" + }, + "intent_template_id": 500, + "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify that this task requires navigation and a response" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 183, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "SKU", + "N": "10" + }, + "intent": "Give me the SKU of the products that have 10 units left", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no product that has 10 quantities left.", + "reference_answer_raw_annotation": "There is no product that has 10 quantities left." 
+ }, + "intent_template_id": 368, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 184, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "name", + "N": "0" + }, + "intent": "Give me the name of the products that have 0 units left", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Mona Pullover Hoodlie", + "Nadia Elements Shell", + "Neve Studio Dance Jacket", + "Juno Jacket", + "Olivia 1/4 Zip Light Jacket", + "Gabrielle Micro Sleeve Top", + "Iris Workout Top", + "Layla Tee", + "Elisa EverCool\u2122 Tee", + "Juliana Short-Sleeve Tee", + "Minerva LumaTech\u2122 V-Tee", + "Tiffany Fitness Tee", + "Karissa V-Neck Tee", + "Diva Gym Tee", + "Radiant Tee", + "Gwyn Endurance Tee", + "Desiree Fitness Tee", + "Jade Yoga Jacket", + "Adrienne Trek Jacket", + "Inez Full Zip Jacket", + "Hera Pullover Hoodie", + "Autumn Pullie", + "Miko Pullover Hoodie", + "Selene Yoga Hoodie", + "Daphne Full-Zip Hoodie", + "Phoebe Zipper Sweatshirt", + "Cassia Funnel Sweatshirt", + "Ariel Roll Sleeve Sweatshirt", + "Helena Hooded Fleece", + "Eos V-Neck Hoodie", + "Circe Hooded Ice Fleece", + "Stellar Solar Jacket", + "Josie Yoga Jacket", + "Augusta Pullover Jacket", + "Ingrid Running Jacket", + "Riona Full Zip Jacket", + "Electra Bra Top", + "Erica Evercool Sports Bra", + "Celeste Sports Bra", + "Carina Basic Capri", + "Daria Bikram Pant", + "Sylvia Capri", + "Deirdre Relaxed-Fit Capri", + "Portia Capri", + "Fiona Fitness Short", + "Maxima 
Drawstring Short", + "Gwen Drawstring Bike Short", + "Artemis Running Short", + "Bess Yoga Short", + "Angel Light Running Short", + "Echo Fit Compression Short", + "Sybil Running Short", + "Mimi All-Purpose Short", + "Ana Running Short", + "Ina Compression Short", + "Bardot Capri", + "Aeon Capri", + "Diana Tights", + "Prima Compete Bra Top", + "Lucia Cross-Fit Bra", + "Bella Tank", + "Zoe Tank", + "Nora Practice Tank", + "Nona Fitness Tank", + "Leah Yoga Top", + "Chloe Compete Tank", + "Maya Tunic", + "Antonia Racer Tank", + "Breathe-Easy Tank", + "Karmen Yoga Pant", + "Emma Leggings", + "Ida Workout Parachute Pant", + "Cora Parachute Pant", + "Sahara Leggings", + "Erika Running Short", + "Sprite Yoga Companion Kit", + "Taurus Elements Shell", + "Mars HeatTech\u2122 Pullover", + "Typhon Performance Fleece-lined Jacket", + "Jupiter All-Weather Trainer", + "Montana Wind Jacket", + "Proteus Fitness Jackshirt", + "Gobi HeatTec\u00ae Tee", + "Helios EverCool\u2122 Tee", + "Ryker LumaTech\u2122 Tee (Crew-neck)", + "Atomic Endurance Running Tee (V-neck)", + "Atomic Endurance Running Tee (Crew-Neck)", + "Balboa Persistence Tee", + "Zoltan Gym Tee", + "Aero Daily Fitness Tee", + "Ryker LumaTech\u2122 Tee (V-neck)", + "Logan HeatTec\u00ae Tee", + "Lando Gym Jacket", + "Orion Two-Tone Fitted Jacket", + "Kenobi Trail Jacket", + "Set of Sprite Yoga Straps", + "Chaz Kangeroo Hoodie", + "Teton Pullover Hoodie", + "Bruno Compete Hoodie", + "Frankie Sweatshirt", + "Hollister Backyard Sweatshirt", + "Stark Fundamental Hoodie", + "Hero Hoodie", + "Oslo Trek Hoodie", + "Abominable Hoodie", + "Mach Street Sweatshirt", + "Grayson Crewneck Sweatshirt", + "Ajax Full-Zip Sweatshirt", + "Marco Lightweight Active Hoodie", + "Beaumont Summit Kit", + "Hyperion Elements Jacket", + "Deion Long-Sleeve EverCool\u2122 Tee", + "Strike Endurance Tee", + "Erikssen CoolTech\u2122 Fitness Tank", + "Livingston All-Purpose Tight", + "Orestes Yoga Pant", + "Aether Gym Pant", + "Cronus Yoga Pant -33-Blue", 
+ "Cronus Yoga Pant", + "Cobalt CoolTech\u2122 Fitness Short", + "Apollo Running Short", + "Meteor Workout Short", + "Torque Power Short", + "Hawkeye Yoga Short", + "Lono Yoga Short", + "Rapha Sports Short", + "Orestes Fitness Short", + "Troy Yoga Short", + "Sol Active Short", + "Arcadio Gym Short", + "Zeppelin Yoga Pant", + "Thorpe Track Pant", + "Mithra Warmup Pant", + "Tristan Endurance Tank", + "Primo Endurance Tank", + "Helios Endurance Tank", + "Rocco Gym Tank", + "Vulcan Weightlifting Tank", + "Argus All-Weather Tank", + "Sparta Gym Tank", + "Sinbad Fitness Tank", + "Tiberius Gym Tank", + "Atlas Fitness Tank", + "Cassius Sparring Tank", + "Caesar Warm-Up Pant", + "Viktor LumaTech\u2122 Pant", + "Geo Insulated Jogging Pant", + "Supernova Sport Pant", + "Kratos Gym Pant", + "Pierce Gym Short" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Sinbad Fitness Tank" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sinbad Fitness Tank" + }, + "intent_template_id": 368 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 185, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "brand", + "N": "3" + }, + "intent": "Give me the brand of the products that have 3 units left", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Eos V-Neck Hoodie-S-Blue", + "Minera Luma Tech V-Tee-XS-Blue" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": 
{ + "must_include": [ + "Eos", + "Minerva" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Eos, Minerva" + }, + "intent_template_id": 368, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 186, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "product names and the sizes", + "N": "2-3" + }, + "intent": "Give me the product names and the sizes of the products that have 2-3 units left", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Eos V-Neck Hoodie-S-Blue", + "type": "text" + }, + "size": { + "value": "S", + "type": "text" + } + }, + { + "name": { + "value": "Minerva LumaTech\u2122 V-Tee-XS-Blue", + "type": "text" + }, + "size": { + "value": "XS", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Eos V-Neck Hoodie: S", + "Minera Luma Tech V-Tee: XS" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Eos V-Neck Hoodie: S Minera Luma Tech V-Tee: XS" + }, + "intent_template_id": 368, + "format_specification": "Use \"name\" for the product name and \"size\" for the size.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 187, + 
"require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": { + "Attribute": "SKU", + "N": "1-3" + }, + "intent": "Give me the SKU of the products that have 1-3 units left", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "WH11-S-Blue", + "WS08-XS-Blue" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "WH11-S-Blue", + "WS08-XS-Blue" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "WH11-S-Blue, WS08-XS-Blue" + }, + "intent_template_id": 368, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 188, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the total cost of my latest {{status}} order", + "original.intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Get the total cost of my latest cancelled order", + "original.intent": "Tell me the total cost of my latest cancelled order?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "365.42" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "must_include": [ + "365.42" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "365.42" + }, + "intent_template_id": 214, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 189, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the total cost of my latest {{status}} order", + "original.intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "pending" + }, + "intent": "Get the total cost of my latest pending order", + "original.intent": "Tell me the total cost of my latest pending order?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "754.99" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "754.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "754.99" + }, + "intent_template_id": 214, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 190, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + 
"geolocation": null, + "intent_template": "Get the total cost of my latest {{status}} order", + "original.intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "complete" + }, + "intent": "Get the total cost of my latest complete order", + "original.intent": "Tell me the total cost of my latest complete order?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "65.32" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65.32" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "65.32" + }, + "intent_template_id": 214, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 191, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the total cost of my latest {{status}} order", + "original.intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Get the total cost of my latest processing order", + "original.intent": "Tell me the total cost of my latest processing order?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no order of \"processing\" status", + "reference_answer_raw_annotation": "There is no order of \"processing\" status" + }, + "intent_template_id": 214, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 192, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the total cost of my latest {{status}} order", + "original.intent_template": "Tell me the total cost of my latest {{status}} order?", + "instantiation_dict": { + "status": "non-cancelled" + }, + "intent": "Get the total cost of my latest non-cancelled order", + "original.intent": "Tell me the total cost of my latest non-cancelled order?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "754.99" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "754.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "754.99" + }, + "intent_template_id": 214, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value instead of navigation" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + 
"shopping_admin" + ], + "task_id": 193, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "completed", + "N": "2" + }, + "intent": "Get the total payment amount of the last 2 completed orders", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "182.4" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "182.4" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "182.4" + }, + "intent_template_id": 367, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 194, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "completed", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 completed orders", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "555.2" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "555.2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + 
"reference_answer_raw_annotation": "555.2" + }, + "intent_template_id": 367, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 195, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "pending", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 pending orders", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "885.4" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "885.4" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "885.4" + }, + "intent_template_id": 367, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 196, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Compare the payment difference of the last {{N}} {{status_1}} orders and {{status_2}} orders", + "instantiation_dict": { + "status_1": "cancelled", + "status_2": "completed", + "N": "4" + }, + "intent": "Return the payment difference between the last 4 cancelled orders and the last 4 completed orders", + "original.intent": "Compare the payment difference of the last 4 cancelled orders and completed orders", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "194.25" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "194.25" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "194.25" + }, + "intent_template_id": 367, + "changelogs": [ + { + "key": "intent", + "category": "task_ambiguity", + "note": "Compare vs return the payment difference" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 197, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": { + "status": "non-cancelled", + "N": "5" + }, + "intent": "Get the total payment amount of the last 5 non-cancelled orders", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "778.2" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "778.2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "annotation_note": "219.4+210+166.4+93.4+89", + "reference_answer_raw_annotation": "778.2" + }, + "intent_template_id": 367, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 198, + 
"require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "customer name", + "status": "most recent cancelled" + }, + "intent": "Get the customer name of the most recent cancelled order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Lily Potter" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Lily Potter" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lily Potter" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 199, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "order ID", + "status": "newest pending" + }, + "intent": "Get the order ID of the newest pending order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 299 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "299" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "299" + }, + "intent_template_id": 366, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": 
"Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 200, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "billing name", + "status": "oldest complete" + }, + "intent": "Get the billing name of the oldest complete order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "John Lee" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "John Lee" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "John Lee" + }, + "intent_template_id": 366 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 201, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "customer name", + "status": "earliest fraud suspect" + }, + "intent": "Get the customer name of the earliest fraud suspect order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no order of \"fraud suspect\" status", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 366, + 
"changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 202, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "date", + "status": "most recent cancelled" + }, + "original.instantiation_dict": { + "attribute": "date", + "status": "most recent canlled" + }, + "intent": "Get the date of the most recent cancelled order", + "original.intent": "Get the date of the most recent canlled order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "May 23 2023" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "May 23 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "May 23, 2023" + }, + "intent_template_id": 366, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Update cancelled spelling" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 203, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "purchase date and order id", + "status": "most recent pending" + }, + "intent": "Get the purchase date and order id of the most 
recent pending order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "date": { + "value": "May 31, 2023", + "type": "date" + }, + "order_id": { + "value": "000000299", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "order id: 000000299", + "purchase date: May 31, 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000299, May 31, 2023, 2:55:09 AM" + }, + "intent_template_id": 366, + "format_specification": "Use \"date\" for the date and \"order_id\" for the order id.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 204, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": { + "attribute": "product name and discounted price (low to high)", + "status": "most recent completed" + }, + "intent": "Get the product name and discounted price (low to high) of the most recent completed order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "true", + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Proteus Fitness Jackshirt", + "type": "text" + }, + "price": { + "value": "$45", + "type": "currency" + } + }, + { + "name": { + "value": "Ida Workout Parachute Pant", + "type": "text" + }, + "price": { + "value": "$48", + "type": "currency" + } + } + 
] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Rapha Sports Short: $35", + "Thorpe Track Pant: $54.4", + "Mach Street Sweatshirt: $62" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Rapha Sports Short: $35 Thorpe Track Pant: $54.4 Mach Street Sweatshirt: $62" + }, + "intent_template_id": 366, + "format_specification": "Use \"name\" for the product name and \"price\" for the discounted price.", + "changelogs": [ + { + "key": "format_specification", + "category": "reference_alignment", + "note": "Original expected was for the first pending order not a completed order" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 205, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": { + "user": "kilian", + "date": "3/5/2023" + }, + "intent": "How many commits did kilian make on 3/5/2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 320, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": 
"permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 206, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": { + "user": "Eric", + "date": "3/2" + }, + "intent": "How many commits did Eric make on 3/2?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 2 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "2" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2" + }, + "intent_template_id": 320, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 207, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make on {{date}} in total?", + "instantiation_dict": { + "user": "Eric and Kilian", + "date": "1/3/2023" + }, + "intent": "How many commits did Eric and Kilian make on 1/3/2023 in total?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "gitlab" + }, + 
"original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 320, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 208, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "+1 2058812302" + }, + "intent": "Find the customer name and email with phone number +1 2058812302", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "John Smith", + "type": "text" + }, + "email": { + "value": "john.smith.xyz@gmail.com", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "John Smith", + "john.smith.xyz@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "John Smith, john.smith.xyz@gmail.com" + }, + "intent_template_id": 364, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": 
"Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 209, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "2137418080" + }, + "intent": "Find the customer name and email with phone number 2137418080", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Jennifer White", + "type": "text" + }, + "email": { + "value": "jennifer.white@yahoo.com", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Jennifer White", + "jennifer.white@yahoo.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jennifer White, jennifer.white@yahoo.com" + }, + "intent_template_id": 364, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 210, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "2065555555" + }, + "intent": "Find the customer name and email with phone number 2065555555", + "require_reset": false, + 
"eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Adam Garcia", + "type": "text" + }, + "email": { + "value": "gamingpro456@gmail.com", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Adam Garcia", + "gamingpro456@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Adam Garcia, gamingpro456@gmail.com" + }, + "intent_template_id": 364, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 211, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "8015551212" + }, + "intent": "Find the customer name and email with phone number 8015551212", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Sean Miller", + "type": "text" + }, + "email": { + "value": "sean.miller@gmail.com", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sean Miller", + 
"sean.miller@gmail.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sean Miller, sean.miller@gmail.com" + }, + "intent_template_id": 364, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 212, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": { + "PhoneNum": "555-229-3326" + }, + "intent": "Find the customer name and email with phone number 555-229-3326", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Veronica Costello", + "type": "text" + }, + "email": { + "value": "roni_cost@example.com", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Veronica Costello", + "roni_cost@example.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Veronica Costello, roni_cost@example.com" + }, + "intent_template_id": 364, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", 
+ "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 213, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "List all reviews with 3 stars or below for Antonia Racer Tank.", + "original.intent": "What are the key aspects that the customers don't like about Antonia Racer Tank", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Zero support/modesty", + "type": "text" + }, + "rating": { + "value": "2", + "type": "numeric" + } + }, + { + "title": { + "value": "Not for high impact", + "type": "text" + }, + "rating": { + "value": "3", + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Not suitable for high-impact workouts" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Not suitable for high-impact workouts" + }, + "intent_template_id": 249, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 214, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + 
"intent_template": "List all reviews with 3 stars or below for {{product}}.", + "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Zing Jump Rope" + }, + "intent": "List all reviews with 3 stars or below for Zing Jump Rope.", + "original.intent": "What are the key aspects that the customers don't like about Zing Jump Rope", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "It is hard to find the right size. Won't last long" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "It is hard to find the right size. Won't last long" + }, + "intent_template_id": 249, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 215, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Circe ice fleece" + }, + "intent": "List all reviews with 3 stars or below for Circe ice fleece.", + "original.intent": "What are the key aspects that the customers don't like about Circe ice fleece", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": 
{ + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Bad!", + "type": "text" + }, + "rating": { + "value": "1", + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Material quality, fit, insufficient warmth, color" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Material quality, fit, insufficient warmth, color" + }, + "intent_template_id": 249, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 216, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Electra Bra Top" + }, + "intent": "List all reviews with 3 stars or below for Electra Bra Top.", + "original.intent": "What are the key aspects that the customers don't like about Electra Bra Top", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Not exactly true to size", + "type": "text" + }, + "rating": { + "value": "3", + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + 
"reference_answers": { + "fuzzy_match": [ + "Not true to size" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Not true to size" + }, + "intent_template_id": 249, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 217, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", + "instantiation_dict": { + "product": "Pursuit Tone Band" + }, + "intent": "List all reviews with 3 stars or below for Pursuit Tone Band.", + "original.intent": "What are the key aspects that the customers don't like about Pursuit Tone Band", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "title": { + "value": "Agreed. More resistance", + "type": "text" + }, + "rating": { + "value": "3", + "type": "numeric" + } + }, + { + "title": { + "value": "Want more resistance", + "type": "text" + }, + "rating": { + "value": "3", + "type": "numeric" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Insufficient resistance for their workouts." + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Insufficient resistance for their workouts." 
+ }, + "intent_template_id": 249, + "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 218, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "CMU, Pittsburgh", + "n": "5" + }, + "intent": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no hotel near CMU that is within 5 minutes walking distance", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 41, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 219, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "Pittsburgh airport", + "n": "3" + }, + "intent": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no hotel near CMU that 
is within 5 minutes walking distance", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 41, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 220, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", + "instantiation_dict": { + "location": "Gardner Steel Conference Center,", + "n": 5 + }, + "intent": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Wyndham Pittsburgh University Cente: 375m", + "The Oaklander Hotel: 338m" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Wyndham Pittsburgh University Cente: 375 m\nThe Oaklander Hotel: 338 m" + }, + "intent_template_id": 41, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 221, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to the nearest {{location}} with different transportation methods?", + "instantiation_dict": { + "location": "USPS postal office" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Walk: 
1 minute", + "Drive: less than 1 minute", + "Bike: less than 1 minute" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Walk: 1 minute to walk and\nDrive: less than 1 minute\nBike: less than 1 minute" + }, + "intent_template_id": 35, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 222, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "cold stone ice cream" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3min" + }, + "intent_template_id": 35, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 223, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "Mcdonald's" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "4min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4min" + }, 
+ "intent_template_id": 35, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 224, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", + "instantiation_dict": { + "location": "wendys" + }, + "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "3min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3min" + }, + "intent_template_id": 35, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 225, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", + "original.intent_template": "What do customers say about {{product_type}} from {{manufature}}", + "instantiation_dict": { + "product_type": "brush", + "manufature": "sephora" + }, + "intent": "Return the titles for reviews with 3 stars or below for brush from sephora", + "original.intent": "What do customers say about brush from sephora", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + 
"reference_url": "", + "program_html": [], + "string_note": "The sephora brushes don't have reviews", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 135, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 226, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "Amazon basic" + }, + "intent": "What is the price range for products from Amazon basic?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 5.49, + "type": "currency" + }, + "max": { + "value": 375.19, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "5.49", + "375.19" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$5.49 - $375.19" + }, + "intent_template_id": 370, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 227, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "EYZUTAK" + }, + "intent": "What is the price range for products from EYZUTAK?", + 
"require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 9.99, + "type": "currency" + }, + "max": { + "value": 9.99, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$9.99" + }, + "intent_template_id": 370, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 228, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "sephora" + }, + "intent": "What is the price range for products from sephora?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 18.18, + "type": "currency" + }, + "max": { + "value": 94.99, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "18.18", + "94.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$18.18 - $94.99" + }, + "intent_template_id": 370, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest 
price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 229, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "ugreen" + }, + "intent": "What is the price range for products from ugreen?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "min": { + "value": 6.99, + "type": "currency" + }, + "max": { + "value": 38.99, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "6.99", + "38.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$6.99 - $38.99" + }, + "intent_template_id": 370, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 230, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": { + "brand": "Perricone MD" + }, + "intent": "What is the price range for products from Perricone MD?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + 
"expected_data": [ + { + "min": { + "value": 35.0, + "type": "currency" + }, + "max": { + "value": 149.0, + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "35", + "149" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$35 - $149" + }, + "intent_template_id": 370, + "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 231, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Get the order number of my most recent cancelled order ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 170 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "170" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000170" + }, + "intent_template_id": 213, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 232, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order 
", + "instantiation_dict": { + "status": "pending" + }, + "intent": "Get the order number of my most recent pending order ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 189 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "189" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000189" + }, + "intent_template_id": 213, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 233, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "complete" + }, + "intent": "Get the order number of my most recent complete order ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 180 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "180" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "000000180" + }, + "intent_template_id": 213, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 234, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": 
"__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "on hold" + }, + "intent": "Get the order number of my most recent on hold order ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no on hold order", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 213, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 235, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": { + "status": "under delivery" + }, + "intent": "Get the order number of my most recent under delivery order ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no under delivery order", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 213, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + 
"map" + ], + "task_id": 236, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": { + "location": "pharmacy", + "location2": "Carnegie Mellon", + "condition": "I can walk within 20mins" + }, + "intent": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Schiller's Pharmacy", + "811", + "South Aiken Avenue", + "Shadyside", + "Pittsburgh" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Schiller's Pharmacy", + "811", + "South Aiken Avenue", + "Shadyside", + "Pittsburgh" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Schiller's Pharmacy, 811, South Aiken Avenue, Shadyside, Pittsburgh, Allegheny County, 15232, United States" + }, + "intent_template_id": 39, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 237, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": { + "location": "gas station", + "location2": "CMU", + "condition": "" + }, + "intent": "Where is the nearest gas station from CMU ", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Sunoco", + 
"North Craig Street", + "North Oakland", + "Pittsburgh" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Sunoco", + "North Craig Street", + "North Oakland", + "Pittsburgh" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Sunoco, North Craig Street, North Oakland, Pittsburgh, Allegheny County, 15213, United States" + }, + "intent_template_id": 39, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 238, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "PS4 accessories" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from PS4 accessories category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 239, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from 
{{product_category}} category", + "instantiation_dict": { + "product_category": "nutrition bars and drinks" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from nutrition bars and drinks category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 240, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "competitive swimwear" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from competitive swimwear category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 241, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": 
"__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "skin care tool" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care tool category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping" + ], + "task_id": 242, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "instantiation_dict": { + "product_category": "Household Supplies" + }, + "intent": "I am doing a market survey for one stop market, show me the most expensive product from Household Supplies category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", + "program_html": [], + 
"url_note": "GOLD in PRED" + }, + "intent_template_id": 138 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 243, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "Circe fleece" + }, + "intent": "Show me the email address of the customer who is the most unhappy with Circe fleece", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "hannah.lim@gmail.com" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "hannah.lim@gmail.com" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "hannah.lim@gmail.com" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 244, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "Olivia zip jacket" + }, + "intent": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "emma.lopez@gmail.com" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "exact_match": "emma.lopez@gmail.com" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "emma.lopez@gmail.com" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 245, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "name", + "product": "Antonia racer tank" + }, + "intent": "Show me the name of the customer who is the most unhappy with Antonia racer tank", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Shaunte" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Shaunte" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shaunte" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 246, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "name", + "product": "Chloe tank" + }, + "intent": "Show me the name of the customer who is the most unhappy with Chloe tank", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + 
"Teofila" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Teofila" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Teofila" + }, + "intent_template_id": 244 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 247, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": { + "information": "email address", + "product": "the style of Zoe products" + }, + "intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "Valorie doesn't have a email in the system", + "program_html": [], + "string_note": "There is no negative review for Zoe products", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 244, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 248, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Carnegie Mellon Caf\u00e9" + }, + "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format", + "require_reset": false, + "eval": 
{ + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "40.442", + "-79.939" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.442", + "-79.939" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4424191, -79.9397388" + }, + "intent_template_id": 46, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 249, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital Heliport" + }, + "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "40.460", + "-79.946" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.460", + "-79.946" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.46076, -79.94666" + }, + "intent_template_id": 46, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 250, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in 
DD format", + "instantiation_dict": { + "location": "Apple Store near Pitt" + }, + "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "40.451", + "-79.933" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.451", + "-79.933" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4511693, -79.9334241" + }, + "intent_template_id": 46, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 251, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "bus stop on the Carnegie art museum side of the street near CMU" + }, + "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "40.444", + "-79.948" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.444", + "-79.948" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.4443, -79.94889" + }, + "intent_template_id": 46, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched 
to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 252, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": { + "location": "Tokyo Japanese Food Store in Pittsburgh" + }, + "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "40.457", + "-79.929" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "40.457", + "-79.929" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.45761, -79.92934" + }, + "intent_template_id": 46, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 253, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Carnegie Mellon Caf\u00e9", + "information": "phone number" + }, + "intent": "What is the phone number of Carnegie Mellon Caf\u00e9", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no such information in the map", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 501, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status 
check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 254, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital", + "information": "phone number" + }, + "intent": "What is the phone number of Western Pennsylvania Hospital", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "4125785000" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "4125785000" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "4125785000" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 255, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Who is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "PIT airport", + "information": "operator" + }, + "intent": "Who is the operator of PIT airport", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Allegheny County Airport Authority" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Allegheny County Airport Authority" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Allegheny County Airport Authority" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + 
"task_id": 256, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Carnegie art museum in pittsburgh", + "information": "website" + }, + "intent": "What is the website of Carnegie art museum in pittsburgh", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "http://web.cmoa.org/" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "http://web.cmoa.org/" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "http://web.cmoa.org/" + }, + "intent_template_id": 501 + }, + { + "sites": [ + "map" + ], + "task_id": 257, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": { + "location": "Tokyo Japanese Food Store in Pittsburgh", + "information": "hours of operation" + }, + "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "We-Su 10:00-17:00" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "We-Su 10:00-17:00" + }, + "intent_template_id": 501, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 258, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to all public projects", + "original.intent_template": "See all public projects", + "instantiation_dict": {}, + "intent": "Navigate to all public projects", + "original.intent": "See all public projects", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/explore" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/explore", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 325, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 259, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Get me my RSS feed token", + "instantiation_dict": {}, + "intent": "Get me my RSS feed token", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "TMN_bBn9Z48qVbUFZV45" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "TMN_bBn9Z48qVbUFZV45" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "TMN_bBn9Z48qVbUFZV45" + }, + "intent_template_id": 312, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 260, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + 
"intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Video Game" + }, + "intent": "I want to browse the products in the Video Game category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/video-games.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/video-games.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 261, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Headphones" + }, + "intent": "I want to browse the products in the Headphones category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/electronics/headphones.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/electronics/headphones.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 262, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Men shoes" + }, + "intent": "I want to browse the products in the Men shoes category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + 
"reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 263, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Woman clothing" + }, + "intent": "I want to browse the products in the Woman clothing category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "shopping" + ], + "task_id": 264, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": { + "category": "Cabinets, Racks & Shelves" + }, + "intent": "I want to browse the products in the Cabinets, Racks & Shelves category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 211 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 265, + "require_login": 
true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How far is it to drive there?", + "instantiation_dict": { + "city": "Boston" + }, + "intent": "What's the closest national park to Boston? How far is it to drive there?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Acadia National Park", + "457km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park", + "457km" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n457km" + }, + "intent_template_id": 85, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 266, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}?", + "instantiation_dict": { + "city": "the largest city in Maine" + }, + "intent": "What's the closest national park to the largest city in Maine?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Acadia National Park" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Acadia National Park" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park" + }, + 
"intent_template_id": 85 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 267, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How long it takes to drive there?", + "instantiation_dict": { + "city": "the hometown of Stephen King" + }, + "intent": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Acadia National Park" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park" + ], + "fuzzy_match": [ + "1h 23min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n1h 23min" + }, + "intent_template_id": 85, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 268, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", + "instantiation_dict": { + "city": "Vinalhaven, ME" + }, + "intent": "What's the closest national park to Vinalhaven, ME? 
How long does it take to bike there?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Acadia National Park" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Acadia National Park" + ], + "fuzzy_match": [ + "10h 33min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Acadia National Park\n10h 33min" + }, + "intent_template_id": 85, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 269, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "25", + "product_category": "women shoes" + }, + "intent": "Show me products under $25 in \"women shoes\" category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 270, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "30", + 
"product_category": "men shoes" + }, + "intent": "Show me products under $30 in \"men shoes\" category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 271, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "46.99", + "product_category": "makeup remover" + }, + "intent": "Show me products under $46.99 in \"makeup remover\" category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 272, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "78", + "product_category": "children dental care" + }, + "intent": "Show me products under $78 in \"children dental care\" category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": 
"__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 273, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": { + "price": "199", + "product_category": "furtiture with accent" + }, + "intent": "Show me products under $199 in \"furtiture with accent\" category", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 139 + }, + { + "sites": [ + "shopping" + ], + "task_id": 274, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "usb wifi" + }, + "intent": "Search for \"usb wifi\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", + "program_html": [], + "url_note": "GOLD in PRED" + 
}, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 275, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "xbox" + }, + "intent": "Search for \"xbox\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/?q=xbox" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=xbox", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 276, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "switch accessories" + }, + "intent": "Search for \"switch accessories\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 277, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "batteries for iphone 13" + }, + "intent": "Search for \"batteries for iphone 13\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": 
"__SHOPPING__/catalogsearch/result/?q=iphone+13" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=iphone+13", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 278, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": { + "keyword": "green tea bag for weight loss" + }, + "intent": "Search for \"green tea bag for weight loss\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 212 + }, + { + "sites": [ + "shopping" + ], + "task_id": 279, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "original.intent_template": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models", + "instantiation_dict": { + "product": "Bluetooth headphones from Sony" + }, + "original.instantiation_dict": {}, + "intent": "Provide me with the full names of Bluetooth headphones from Sony, and also share the price range for the available models", + "original.intent": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the 
available models", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "names": { + "value": [ + "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", + "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", + "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", + "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", + "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", + "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", + "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", + "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", + "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", + "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", + "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", + "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed" + ], + "type": "text" + }, + "min": { + "value": "18.99", + "type": "currency" + }, + "max": { + "value": "406", + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", + "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", + "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) 
Basic Headphone Bundle Kit with Stylus", + "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", + "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", + "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", + "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", + "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", + "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", + "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", + "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", + "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed", + "18.99", + "406" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are avaiable: SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed) Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items) Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W) Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B) Sony MDRAS600BT Active Sports Bluetooth Headset (Black) Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 
Wireless TV Adapter Bundle (2 Items) Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R) Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed The price ranges from $18.99 to $406 " + }, + "intent_template_id": 204, + "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 280, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "original.intent_template": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", + "instantiation_dict": { + "product": "chargers from Anker" + }, + "original.instantiation_dict": {}, + "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "names": { + "value": [ + "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", + "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", + "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, 
with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", + "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", + "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", + "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", + "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", + "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", + "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", + "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", + "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", + "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)" + ], + "type": "text" + }, + "min": { + "value": "8.99", + "type": "currency" + }, + "max": { + "value": "59.99", + "type": 
"currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", + "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", + "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", + "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", + "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", + "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", + "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", + "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", + "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", + "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / 
Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", + "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", + "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)", + "8.99", + "59.99" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are availiable: Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included) Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278) 5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter) Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter) USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More iPhone 
12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111) Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included) Magnetic Wireless Charger, Anker Wireless Charger with 5ft Built-in USB-C Cable, PowerWave Magnetic Pad, 7.5W Charging for iPhone 13 / 13 Pro / 13 Pro Max / 13 mini / 12 / 12 Pro (No AC Adapter) USB C Super Fast Charger, Anker 25W PD Wall Charger Fast Charging for Samsung Galaxy S21/S21+/S21 Ultra/S20/Z Flip/Note20/20 Ultra/Note10/10+/S9/S8/S10e, iPad Pro 12.9, and More (Cable not Included) The price ranges from $8.99 to $59.99" + }, + "intent_template_id": 204, + "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 281, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the 
available models", + "original.intent_template": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", + "instantiation_dict": { + "product": "Oral B brush heads designed for children" + }, + "original.instantiation_dict": {}, + "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models", + "original.intent": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "names": { + "value": [ + "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", + "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack" + ], + "type": "text" + }, + "min": { + "value": "12.99", + "type": "currency" + }, + "max": { + "value": "14.98", + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", + "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack", + "3.745", + "6.495" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are availiable: Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack The price ranges from $3.745 to $6.495 " + }, + "intent_template_id": 204, + "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum 
price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 282, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "original.intent_template": "List the full product names of slide slippers from Nike and tell me the price range of the available products", + "instantiation_dict": { + "product": "slide slippers from Nike" + }, + "original.instantiation_dict": {}, + "intent": "Provide me with the full names of slide slippers from Nike, and also share the price range for the available models", + "original.intent": "List the full product names of slide slippers from Nike and tell me the price range of the available products", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "names": { + "value": [ + "Nike Men's Air Max Camden Slide Sandal", + "Nike Men's Benassi JDI Fanny Pack Slides", + "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", + "Nike Offcourt Slide Mens Bq4639-002 Size 12", + "Nike Jordan Men's Break Slide Red AR6374-602", + "Nike Victori One Slide Mens Style : Dd9559-300", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", + "Nike womens Benassi Just Do It" + ], + "type": "text" + }, + "min": { + "value": "27.6", + "type": "currency" + }, + "max": { + "value": "90.65", + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + 
"string_match" + ], + "reference_answers": { + "must_include": [ + "Nike Men's Air Max Camden Slide Sandal", + "Nike Men's Benassi JDI Fanny Pack Slides", + "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", + "Nike Offcourt Slide Mens Bq4639-002 Size 12", + "Nike Jordan Men's Break Slide Red AR6374-602", + "Nike Victori One Slide Mens Style : Dd9559-300", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", + "Nike womens Benassi Just Do It", + "27.6", + "90.65" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "These models are availiable: Nike Men's Air Max Camden Slide Sandal Nike Men's Benassi JDI Fanny Pack Slides Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10) Nike Offcourt Slide Mens Bq4639-002 Size 12 Nike Jordan Men's Break Slide Red AR6374-602 Nike Victori One Slide Mens Style : Dd9559-300 Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14) Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8) Nike womens Benassi Just Do It The price ranges from $27.6 to $90.65" + }, + "intent_template_id": 204, + "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 283, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Navigate to the most recent models of XBox controllers released between 2020-2021.", + "original.intent_template": "Look up the most recent models of XBox controllers released 
between 2020-2021?", + "instantiation_dict": {}, + "intent": "Navigate to the most recent models of XBox controllers released between 2020-2021.", + "original.intent": "Look up the most recent models of XBox controllers released between 2020-2021?", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 210, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify expected navigate vs return value. and remove ambiguous '?'." 
+ } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 284, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "shoe storage", + "min_storage": "12 pairs" + }, + "intent": "Show the least expensive shoe storage with a minimum storage capacity of 12 pairs.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "shopping" + ], + "task_id": 285, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "switch card holder", + "min_storage": "15 cards" + }, + "intent": "Show the least expensive switch card holder with a minimum storage capacity of 15 cards.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "shopping" + ], + "task_id": 286, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": { + "product": "ssd hard drive", + "min_storage": "1TB" + }, + "intent": "Show the least expensive ssd hard drive with a minimum storage capacity of 1TB.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 207 + }, + { + "sites": [ + "map" + ], + "task_id": 287, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", + "instantiation_dict": {}, + "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "5h 47min" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "5h 47min" + }, + 
"intent_template_id": 47, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 288, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "name" + }, + "intent": "Tell me the name of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Samantha Jones" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Samantha Jones" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Samantha Jones" + }, + "intent_template_id": 234 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 289, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "email address, name, phone number" + }, + "intent": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "name": { + "value": "Samantha Jones", + "type": "text" + }, + 
"email": { + "value": "coolcat321@hotmail.com", + "type": "text" + }, + "phone_number": { + "value": "3055551212", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "email: coolcat321@hotmail.com", + "name: Samantha Jones", + "phone number: 3055551212" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "email: coolcat321@hotmail.com name: Samantha Jones phone number: 3055551212" + }, + "intent_template_id": 234, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email and \"phone_number\" for the phone number.", + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 290, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "product SKUs in the most recent cancelled orders" + }, + "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "WSH09-29-White", + "WSH09-28-Green", + "MSH11-34-Blue", + "WP09-29-Purple" + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "WSH09-29-White", + "WSH09-28-Green", + "MSH11-34-Blue", + "WP09-29-Purple" + ] + }, + "reference_url": "", + "program_html": 
[], + "string_note": "", + "reference_answer_raw_annotation": "WSH09-29-White,WSH09-28-Green,MSH11-34-Blue,WP09-29-Purple" + }, + "intent_template_id": 234, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 291, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "total spend on products in the most recent cancelled orders" + }, + "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 148 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "148" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "$148" + }, + "intent_template_id": 234, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 292, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": { + "attribute": "total number of cancellations" + }, + "intent": "Tell me the total number of cancellations of the customer who has 
the most cancellations in the history", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 9 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "9" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "9" + }, + "intent_template_id": 234, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 293, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", + "original.intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "Super_Awesome_Robot" + }, + "intent": "Show me the command to clone Super_Awesome_Robot with SSH. 
Use the globally provided host and port", + "original.intent": "Show me the command to clone Super_Awesome_Robot with SSH.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/super_awesome_robot.git" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git" + }, + "intent_template_id": 329, + "changelogs": [ + { + "key": "intent_template", + "category": "reference_alignment", + "note": "Specify which host and port to use when giving the final output" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 294, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", + "original.intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "ChatGPT" + }, + "intent": "Show me the command to clone ChatGPT with SSH. 
Use the globally provided host and port", + "original.intent": "Show me the command to clone ChatGPT with SSH.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/chatgpt.git" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git" + }, + "intent_template_id": 329, + "changelogs": [ + { + "key": "intent_template", + "category": "reference_alignment", + "note": "Specify which host and port to use when giving the final output" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 295, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", + "original.intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "metaseq" + }, + "intent": "Show me the command to clone metaseq with SSH. 
Use the globally provided host and port", + "original.intent": "Show me the command to clone metaseq with SSH.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/root/metaseq.git" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git" + }, + "intent_template_id": 329, + "changelogs": [ + { + "key": "intent_template", + "category": "reference_alignment", + "note": "Specify which host and port to use when giving the final output" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 296, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", + "original.intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "the best GAN python implementation" + }, + "intent": "Show me the command to clone the best GAN python implementation with SSH. 
Use the globally provided host and port", + "original.intent": "Show me the command to clone the best GAN python implementation with SSH.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/eriklindernoren/PyTorch-GAN.git" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git" + }, + "intent_template_id": 329, + "changelogs": [ + { + "key": "intent_template", + "category": "reference_alignment", + "note": "Specify which host and port to use when giving the final output" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 297, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", + "original.intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": { + "repo": "the most stared Covid location tracker" + }, + "intent": "Show me the command to clone the most stared Covid location tracker with SSH. 
Use the globally provided host and port", + "original.intent": "Show me the command to clone the most stared Covid location tracker with SSH.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/yjlou/2019-nCov.git" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git" + }, + "intent_template_id": 329, + "changelogs": [ + { + "key": "intent_template", + "category": "reference_alignment", + "note": "Specify which host and port to use when giving the final output" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 298, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": { + "status": "completed" + }, + "intent": "Show the most recent completed order", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/sales/order/view/order_id/180/" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/180/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 299, + "require_login": true, + "storage_state": 
"./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": { + "status": "cancelled" + }, + "intent": "Show the most recent cancelled order", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/sales/order/view/order_id/170/" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/170/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 300, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": { + "status": "pending" + }, + "intent": "Show the most recent pending order", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/sales/order/view/order_id/189/" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/sales/order/view/order_id/189/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 180 + }, + { + "sites": [ + "shopping" + ], + "task_id": 301, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Show the most recent processing order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + 
"eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no order in processing" + }, + "intent_template_id": 180, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 302, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": { + "status": "out of delivery" + }, + "intent": "Show the most recent out of delivery order", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "there is no order in processing" + }, + "intent_template_id": 180, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 303, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Kilian", + "period": "during 2023" + }, + "original.instantiation_dict": { + "user": "Kilian", + "period": "durning 2023" + }, + "intent": "How many commits did Kilian make during 2023?", + "original.intent": "How many commits did Kilian make durning 2023?", + "require_reset": false, + "eval": { + 
"expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 1 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "1" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1" + }, + "intent_template_id": 321, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Specify response format and correct spelling" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 304, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Eric", + "period": "between Feb 2023 and May 2023" + }, + "intent": "How many commits did Eric make between Feb 2023 and May 2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 14 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "14" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "14" + }, + "intent_template_id": 321, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured 
response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 305, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Philip", + "period": "in 2023/1" + }, + "intent": "How many commits did Philip make in 2023/1?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 321, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 306, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Anthony", + "period": "between 08/2022-09/2022" + }, + "intent": "How many commits did Anthony make between 08/2022-09/2022?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, 
+ "expected_data": [ + 0 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 321, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 307, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": { + "user": "Nic", + "period": "in April 2021" + }, + "intent": "How many commits did Nic make in April 2021?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "16" + }, + "intent_template_id": 321, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Expect the right error code for unachievable tasks" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 308, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the 
most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "primer/design" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Shawn Allen" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Shawn Allen" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shawn Allen" + }, + "intent_template_id": 323, + "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 309, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "thoughtbot/administrate" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Grayson Wright" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Grayson Wright" + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Grayson Wright" + }, + "intent_template_id": 323, + "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 310, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "AndroidSlidingUpPanel" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "tokudu" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "tokudu" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "tokudu" + }, + "intent_template_id": 323, + "format_specification": "Return the first and last name as a string. 
If first and last name don't exist then return their username", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 311, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "Pytorch GAN" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Erik Linder-Nor\u00e9n" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Erik Linder-Nor\u00e9n" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Erik Linder-Nor\u00e9n" + }, + "intent_template_id": 323, + "format_specification": "Return the first and last name as a string. 
If first and last name don't exist then return their username", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 312, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", + "instantiation_dict": { + "repo": "csvkit" + }, + "intent": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Christopher Groskopf" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "Christopher Groskopf" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Christopher Groskopf" + }, + "intent_template_id": 323, + "format_specification": "Return the first and last name as a string. 
If first and last name don't exist then return their username", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 313, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return the phone number to call for the customer service?", + "original.intent_template": "Which number to call for the customer service?", + "instantiation_dict": {}, + "intent": "Return the phone number to call for the customer service?", + "original.intent": "Which number to call for the customer service?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no phone number in the website", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 134, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to specify return value" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 314, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "primer/design", + "attribute": "name" + }, + "original.instantiation_dict": { + "repo": "prime/design", + 
"attribute": "name" + }, + "intent": "List the name of the top 3 contributors to primer/design repo, ranked by the number of commits?", + "original.intent": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "Shawn Allen", + "Inayaili Le\u00f3n", + "Aurora Pleguezuelo" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Shawn Allen", + "Inayaili Le\u00f3n", + "Aurora Pleguezuelo" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Shawn Allen, Inayaili Le\u00f3n, Aurora Pleguezuelo" + }, + "intent_template_id": 324, + "format_specification": "Return the first and last names as a list", + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Corrected misspelling" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 315, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "Pytorch GAN", + "attribute": "email address" + }, + "intent": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": 
"verify_retrieved_value_normalized", + "eval_params": { + "type": "text" + }, + "expected_data": [ + "eriklindernoren@live.se", + "eriklindernoren@gmail.com", + "pinnacle.chen@qq.com" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "eriklindernoren@live.se", + "eriklindernoren@gmail.com", + "pinnacle.chen@qq.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "eriklindernoren@live.se, eriklindernoren@gmail.com, pinnacle.chen@qq.com" + }, + "intent_template_id": 324, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 316, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "facebook's guide on building react apps", + "attribute": "name" + }, + "intent": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Ian Sutherland", + "Joe Hadda", + "Dan Abramov" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Ian Sutherland, Joe Hadda, Dan Abramov" + }, + 
"intent_template_id": 324, + "changelogs": [ + { + "key": "expected_data", + "category": "unachievable_tasks", + "note": "Repository not found" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 317, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "metaseq", + "attribute": "name and number of commits" + }, + "intent": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Susan Zhang: 70", + "Stephen Roller: 51", + "Peter Albert: 12" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Susan Zhang: 70", + "Stephen Roller: 51", + "Peter Albert: 12" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Susan Zhang: 70, Stephen Roller: 51, Peter Albert: 12" + }, + "intent_template_id": 324, + "format_specification": "Return a list where each element is a string with format \"<name>: <number of commits>\"", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + 
"sites": [ + "gitlab" + ], + "task_id": 318, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": { + "repo": "2019-nCov", + "attribute": "last names" + }, + "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Lo", + "Chen", + "Chu" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lo", + "Chen", + "Chu" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lo, Chen, Chu" + }, + "intent_template_id": 324, + "format_specification": "Return a list of their last names", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 319, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "April 2022" + }, + "intent": "How much refund I should expect from my order canceled in April 2022, including shipping fee", + 
"original.intent": "How much refund I should expect from my order canlled in April 2022, including shipping fee", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 160, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 320, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "Feb 2023" + }, + "intent": "How much refund I should expect from my order canceled in Feb 2023, including shipping fee", + "original.intent": "How much refund I should expect from my order canlled in Feb 2023, including shipping fee", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "406.53" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "406.53" + ] + }, + "reference_url": 
"", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "406.53" + }, + "intent_template_id": 160, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 321, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", + "instantiation_dict": { + "time": "2022" + }, + "intent": "How much refund I should expect from my order canceled in 2022, including shipping fee", + "original.intent": "How much refund I should expect from my order canlled in 2022, including shipping fee", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "3053.97" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "3053.97" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "3053.97" + }, + "intent_template_id": 160, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 322, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + 
"geolocation": null, + "intent_template": "How much refund I should expect from my order canceled in {{time}} if I cannot get the shipping fee refunded?", + "original.intent_template": "How much refund I should expect from my order canlled in {{time}} if I cannot get the shipping fee refunded?", + "instantiation_dict": { + "time": "May 2023" + }, + "intent": "How much refund I should expect from my order canceled in May 2023 if I cannot get the shipping fee refunded?", + "original.intent": "How much refund I should expect from my order canlled in May 2023 if I cannot get the shipping fee refunded?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "350.42" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "350.42" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "350.42" + }, + "intent_template_id": 160, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 323, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "How much refund I should expect from my order canceled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "original.intent_template": "How much refund I should expect from my order canlled in {{time}}? 
I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "instantiation_dict": { + "time": "2022/03" + }, + "intent": "How much refund I should expect from my order canceled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "original.intent": "How much refund I should expect from my order canlled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "264.49" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "264.49" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "264.49" + }, + "intent_template_id": 160, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix spelling of canceled" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 324, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "chairs", + "sorting_order": "ascending price" + }, + "intent": "Show me the \"chairs\" listings by ascending price.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc" + } 
+ ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 325, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "mouth night guard", + "sorting_order": "descending price" + }, + "intent": "Show me the \"mouth night guard\" listings by descending price.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 326, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "Canon photo printer", + "sorting_order": "search relevance, from most to least" + }, + "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + 
"reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 327, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "iphone 12 phone case", + "sorting_order": "name alphabetically" + }, + "intent": "Show me the \"iphone 12 phone case\" listings by name alphabetically.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 328, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": { + "product": "iphone 12 phone case", + "sorting_order": "price" + }, + "intent": "Show me the \"iphone 12 phone case\" listings by price.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 208 + }, + { + "sites": [ + "shopping" + ], + "task_id": 329, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "on 4/19/2023" + }, + "intent": "Return how much I spent on 4/19/2023 on shopping at One Stop Market?", + "original.intent": "How much I spend on 4/19/2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 147, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 330, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "in March 2023" + }, + "intent": "Return how much I spent in 
March 2023 on shopping at One Stop Market?", + "original.intent": "How much I spend in March 2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "83.31" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "81.31" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "81.31" + }, + "intent_template_id": 147, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "The two orders add up to 83.31" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 331, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "in July 2022" + }, + "intent": "Return how much I spent in July 2022 on shopping at One Stop Market?", + "original.intent": "How much I spend in July 2022 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "40.16" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + 
"must_include": [ + "40.16" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "40.16" + }, + "intent_template_id": 147, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 332, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", + "instantiation_dict": { + "time": "each month from Jan to the end of March 2023" + }, + "intent": "Return how much I spent each month from Jan to the end of March 2023 on shopping at One Stop Market?", + "original.intent": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "month": { + "value": "Jan", + "type": "month" + }, + "total": { + "value": "572.88", + "type": "currency" + } + }, + { + "month": { + "value": "Feb", + "type": "month" + }, + "total": { + "value": "947.5", + "type": "currency" + } + }, + { + "month": { + "value": "Mar", + "type": "month" + }, + "total": { + "value": "83.31", + "type": "currency" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "Jan: 572.8", + "Feb: 762.18", + "Mar: 83.31" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Jan: 
572.8\nFeb: 762.18\nMar: 83.31" + }, + "intent_template_id": 147, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ], + "format_specification": "Use \"month\" for month and \"total\" for spent amount." + }, + { + "sites": [ + "shopping" + ], + "task_id": 333, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "original.intent_template": "How much did I spend on shopping at One Stop Market {{time}}? They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "instantiation_dict": { + "time": "on November 2022" + }, + "intent": "Return how much I spent on November 2022 on shopping at One Stop Market?", + "original.intent": "How much did I spend on shopping at One Stop Market on November 2022? 
They gave me a 20% discount on the total amount for orders exceeding $200 in cash", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "currency" + }, + "expected_data": [ + "403.18" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "359.546" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "359.546" + }, + "intent_template_id": 147, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "The three orders add up to 403.18" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 334, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "muffin cornbread mix" + }, + "intent": "Tell me when I last ordered my muffin cornbread mix?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "March 11th 2023" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "March 11th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "March 11th 2023" + }, + "intent_template_id": 169, + "changelogs": [ + { + "key": "eval", + 
"category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 335, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "body butter" + }, + "intent": "Tell me when I last ordered my body butter?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "January 16th 2023" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January 16th 2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January 16th 2023" + }, + "intent_template_id": 169, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 336, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "conditioner" + }, + "intent": "Tell me when I last ordered my conditioner?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "January 16th 2023" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "January 16th 2023" + ] + }, + "reference_url": "", + 
"program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "January 16th 2023" + }, + "intent_template_id": 169, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 337, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "bread olive" + }, + "intent": "Tell me when I last ordered my bread olive?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "December 12th 2022" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "December 12th 2022" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "December 12th 2022" + }, + "intent_template_id": 169, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 338, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": { + "description": "toothpaste" + }, + "intent": "Tell me when I last ordered my toothpaste?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "December 4th 2022" + ] + } + } + ], + "site": "shopping" 
+ }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "December 4th 2022" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "December 4th 2022" + }, + "intent_template_id": 169, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 339, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "Navigate to and display a list of all opened issues {{description}}", + "original.intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that report bugs" + }, + "intent": "Navigate to and display a list of all opened issues that report bugs", + "original.intent": "List all opened issues that report bugs", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 340, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Navigate to and display a list of all opened issues {{description}}", + "original.intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + 
"description": "that report bugs" + }, + "intent": "Navigate to and display a list of all opened issues that report bugs", + "original.intent": "List all opened issues that report bugs", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 341, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "Navigate to and display a list of all opened issues {{description}}", + "original.intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "requesting new features" + }, + "intent": "Navigate to and display a list of all opened issues requesting new features", + "original.intent": "List all opened issues requesting new features", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + 
"gitlab" + ], + "task_id": 342, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "Navigate to and display a list of all opened issues {{description}}", + "original.intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that ask about OPT model related questions" + }, + "intent": "Navigate to and display a list of all opened issues that ask about OPT model related questions", + "original.intent": "List all opened issues that ask about OPT model related questions", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 343, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/root/metaseq", + "geolocation": null, + "intent_template": "Navigate to and display a list of all opened issues {{description}}", + "original.intent_template": "List all opened issues {{description}}", + "instantiation_dict": { + "description": "that don't have any labels" + }, + "intent": "Navigate to and display a list of all opened issues that don't have any labels", + "original.intent": "List all opened issues that don't have any labels", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None" + } + ], + "site": "gitlab" + }, + 
"original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 299, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 344, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews did our shop receive {{time}}?", + "original.intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "so far" + }, + "original.instantiation_dict": { + "time": "by far" + }, + "intent": "How many reviews did our shop receive so far?", + "original.intent": "How many reviews our shop received by far?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 351 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 345, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews did our shop receive {{time}}?", + "original.intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "in Apr 2023" + }, + "intent": "How many reviews did our shop receive in Apr 2023?", + "original.intent": "How many reviews our shop received in Apr 2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 351 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 346, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews did our shop receive {{time}}?", + "original.intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "during 2022" + }, + "intent": "How many reviews did our shop receive during 2022?", + "original.intent": "How many reviews our shop received during 2022?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 248, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 347, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews did our shop receive {{time}}?", + "original.intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "from the beginning of the shop" + }, + "intent": "How many reviews did our shop receive from the beginning of the shop?", + "original.intent": "How many reviews our shop received from the beginning of the shop?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 351 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "351" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "351" + }, + "intent_template_id": 248, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 348, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "How many reviews did our shop receive {{time}}?", + "original.intent_template": "How many reviews our shop received {{time}}?", + "instantiation_dict": { + "time": "in May 2023" + }, + "intent": "How many reviews did our shop receive in May 2023?", + "original.intent": "How many reviews our shop received in May 2023?", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 248, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording." 
+ }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 349, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Who else has access to my repo {{repo}}, return me their usernames", + "original.intent_template": "Who else have access to my repo {{repo}}, show me their usernames", + "instantiation_dict": { + "repo": "gimmiethat.space" + }, + "intent": "Who else has access to my repo gimmiethat.space, return me their usernames", + "original.intent": "Who else have access to my repo gimmiethat.space, show me their usernames", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "yjlou" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "yjlou" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "yjlou" + }, + "intent_template_id": 298, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a retrieve task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 350, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Who else has access to my repo {{repo}}, return me their usernames", + "original.intent_template": "Who else have access to my repo {{repo}}, show me their usernames", + "instantiation_dict": { + "repo": "prism-theme" + }, + "intent": "Who else 
has access to my repo prism-theme, return me their usernames", + "original.intent": "Who else have access to my repo prism-theme, show me their usernames", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "abisubramanya27" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "abisubramanya27" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Abishek S, abisubramanya27" + }, + "intent_template_id": 298, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a retrieve task" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 351, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "PS4 accessories", + "order": "ascending" + }, + "intent": "List products from PS4 accessories category by ascending price", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", + "program_html": [], + 
"url_note": "GOLD in PRED" + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 352, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "nutrition bars and drinks", + "order": "ascending" + }, + "intent": "List products from nutrition bars and drinks category by ascending price", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 353, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "competitive swimwear", + "order": "ascending" + }, + "intent": "List products from competitive swimwear category by ascending price", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", + "program_html": [], + "url_note": "GOLD in 
PRED" + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 354, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "living room furtniture", + "order": "descending" + }, + "intent": "List products from living room furtniture category by descending price", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 137 + }, + { + "sites": [ + "shopping" + ], + "task_id": 355, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": { + "product_category": "kids' bedding", + "order": "descending" + }, + "intent": "List products from kids' bedding category by descending price", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc" + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 137 + }, + { + "sites": [ + "map" + ], + "task_id": 356, + 
"require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "instantiation_dict": {}, + "intent": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Gates and Hillman Centers", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Independence Hall", + "Philadelphia" + ] + } + } + ] + }, + "intent_template_id": 49 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 357, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Navigate to the merge requests requiring my review", + "original.intent_template": "Checkout merge requests requiring my review", + "instantiation_dict": {}, + "intent": "Navigate to the merge requests requiring my review", + "original.intent": "Checkout merge requests requiring my review", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze" + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": 
"__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 291, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Clarify the intent as a navigation task" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 358, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "shipping method", + "order_number": 187 + }, + "intent": "Get the shipping method for order number 187.", + "original.intent": "Show me the shipping method for order number 187.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Flat Rate - Fixed" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Flat Rate - Fixed" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Flat Rate - Fixed" + }, + "intent_template_id": 206, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify return value expected" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 359, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "original.intent_template": "Show me the 
{{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "order date", + "order_number": "148" + }, + "intent": "Get the order date for order number 148.", + "original.intent": "Show me the order date for order number 148.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "date" + }, + "expected_data": [ + "1/29/2023" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "1/29/2023" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1/29/2023" + }, + "intent_template_id": 206, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify return value expected" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 360, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "product names", + "order_number": "148" + }, + "intent": "Get the product names for order number 148.", + "original.intent": "Show me the product names for order number 148.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", + "Russound 5B45W 4\" Indoor Outdoor 
Speakers White" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", + "Russound 5B45W 4\" Indoor Outdoor Speakers White" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress), Russound 5B45W 4\" Indoor Outdoor Speakers White" + }, + "intent_template_id": 206, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify return value expected" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 361, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "order statuses", + "order_number": "170 and 189" + }, + "intent": "Get the order statuses for order number 170 and 189.", + "original.intent": "Show me the order statuses for order number 170 and 189.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "order_number": { + "value": "170", + "type": "numeric" + }, + "status": { + "value": "canceled", + "type": "text" + } + }, + { + "order_number": { + "value": "189", + "type": "numeric" + }, + "status": { + "value": "pending", + "type": "text" + } + } + ] + } + } + ], 
+ "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": [ + "170: cancelled", + "189: pending" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "170: cancelled, 189: pending" + }, + "intent_template_id": 206, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify return value expected" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched from fuzzy eval" + } + ], + "format_specification": "Use \"order_number\" for order number and \"status\" for status." + }, + { + "sites": [ + "shopping" + ], + "task_id": 362, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", + "instantiation_dict": { + "info": "billing address", + "order_number": "00178" + }, + "intent": "Get the billing address for order number 00178.", + "original.intent": "Show me the billing address for order number 00178.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "object" + }, + "expected_data": [ + { + "address": { + "value": "101 S San Mateo Dr", + "type": "text" + }, + "city": { + "value": "San Mateo", + "type": "text" + }, + "state": { + "value": "California", + "type": "text" + }, + "zip_code": { + "value": "94010", + "type": "text" + }, + "country": { + "value": "United States", + "type": "text" + } + } + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "101 S San Mateo Dr", + "San Mateo", + 
"California", + "94010", + "United States" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Emma Lopez, 101 S San Mateo Dr, San Mateo, California, 94010, United States" + }, + "intent_template_id": 206, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify return value expected" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ], + "format_specification": "Use keys \"address\", \"city\", \"state\", \"zip_code\", and \"country\". Set any key to null if not available." + }, + { + "sites": [ + "map" + ], + "task_id": 363, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "Carnegie Music Hall" + }, + "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "748m" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "748m" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "748m" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 364, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon 
University", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "1.7km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "1.7km" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.7km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 365, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Music Hall", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "2.2km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "2.2km" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "2.2km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 366, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "CVS 
(closet one)", + "location/address_2": "UPMC Shadyside" + }, + "intent": "Measure distance between CVS (closet one) and UPMC Shadyside by walking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "1.2km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "1.2km" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.2km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "map" + ], + "task_id": 367, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "instantiation_dict": { + "location/address_1": "Carnegie Mellon University", + "location/address_2": "CVS (closet one)" + }, + "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "1.4km" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "exact_match": "1.4km" + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "1.4km" + }, + "intent_template_id": 58 + }, + { + "sites": [ + "shopping" + ], + "task_id": 368, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "find discounted items.", + "instantiation_dict": {}, + "intent": "find discounted items.", + 
"require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no function to show only discount items", + "reference_answer_raw_annotation": "There is no function to show only discount items." + }, + "intent_template_id": 188, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 369, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Carnegie Music Hall" + }, + "intent": "Pull up the description page of Carnegie Music Hall on Map", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Music Hall" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 370, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Carnegie Mellon University" + }, + "intent": "Pull up the description page of Carnegie Mellon University on Map", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Mellon University" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 371, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Piada restaurant near Pitt" + }, + "intent": "Pull up the description page of Piada restaurant near Pitt on Map", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Piada Italian Street Food", + "Forbes Avenue" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + "task_id": 372, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "the Costco in Pittsburhg near a river" + }, + "intent": "Pull up the description page of the Costco in Pittsburhg near a river on Map", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Costco", + "Waterfront Drive West" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "map" + ], + 
"task_id": 373, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": { + "location": "Whole Foods near Carnegie Mellon" + }, + "intent": "Pull up the description page of Whole Foods near Carnegie Mellon on Map", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": null, + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Whole Foods", + "East Liberty" + ] + } + } + ] + }, + "intent_template_id": 52 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 374, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show the {{name}} theme settings", + "original.intent_template": "Preview the {{name}} theme for my shop", + "instantiation_dict": { + "name": "Magento Blank" + }, + "intent": "Show the Magento Blank theme settings", + "original.intent": "Preview the Magento Blank theme for my shop", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1" + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 266, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Preview a theme can mean applying the theme. We match the intent to the original evaluation target which is navigating to the theme settings." 
+ } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 375, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Show the {{name}} theme settings", + "original.intent_template": "Preview the {{name}} theme for my shop", + "instantiation_dict": { + "name": "Magento Luma" + }, + "intent": "Show the Magento Luma theme settings", + "original.intent": "Preview the Magento Luma theme for my shop", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/" + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 266, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Preview a theme can mean applying the theme. We match the intent to the original evaluation target which is navigating to the theme settings." 
+ } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 376, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Summarize customer reviews for {{product}}.", + "instantiation_dict": { + "product": "Amazon Echo Dot 3rd generation" + }, + "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "NOT_FOUND_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no review for this product", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 182, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 377, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU ArtPark Lab", + "space": "resturants" + }, + "intent": "Find the resturants around CMU ArtPark Lab", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab" + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 378, + "require_login": true, + "storage_state": null, + "start_url": 
"__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU main campus", + "space": "parking" + }, + "intent": "Find the parking around CMU main campus", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university" + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 379, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "CMU main campus", + "space": "hotel" + }, + "intent": "Find the hotel around CMU main campus", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university" + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 380, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "Carnegie Music Hall", + "space": "bar" + }, + "intent": "Find the bar around Carnegie Music Hall", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall" + } + ], + "site": "map" + }, + 
"original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 381, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the {{space}} around {{location}}", + "instantiation_dict": { + "location": "Carnegie Music Hall", + "space": "hotel" + }, + "intent": "Find the hotel around Carnegie Music Hall", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall" + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "url_match" + ], + "reference_answers": null, + "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", + "program_html": [], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 59 + }, + { + "sites": [ + "map" + ], + "task_id": 382, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "instantiation_dict": {}, + "intent": "I am arriving at Carnegie Mellon University. 
Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "There is no USCIS nearby", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 781, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 383, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", + "instantiation_dict": {}, + "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. 
Tell me the names of supermarkets that are within 15mins driving from the hotel", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Hyatt Regency Pittsburgh International Airport", + "Giant Eagle", + "ALDI" + ] + } + } + ], + "site": "map" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Hyatt Regency Pittsburgh International Airport", + "Giant Eagle", + "ALDI" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Giant Eagle, ALDI" + }, + "intent_template_id": 782, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 384, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", + "instantiation_dict": {}, + "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Lisa Lee", + "Evelyn Kurver", + "Amanda", + "N Randall" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Lee", + "Evelyn Kurver", + "Amanda", + "N Randall" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Lee, Evelyn Kurver, Amanda, N Randall" + }, + 
"intent_template_id": 666, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 385, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "List the customer names who thinks EYZUTAK phone cases are of good looking", + "instantiation_dict": {}, + "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Lisa Lee", + "MH", + "Misba009", + "Amanda", + "N Randall", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Lisa Lee", + "MH", + "Misba009", + "Amanda", + "N Randall", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, Bethany Robertson" + }, + "intent_template_id": 666, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 386, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "What is the rating of {{product}}", + "instantiation_dict": { + "product": "Ugreen lightning to 3.5mm cable" + }, + "intent": "What is the rating of Ugreen lightning to 3.5mm cable", + "original.intent": "What is the rating of Ugreen lightning to 3.5mm cable. 
Please round to the nearest whole number", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "numeric" + }, + "expected_data": [ + 3 + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "65 |OR| 3" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Around 3.1 stars" + }, + "intent_template_id": 1355, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 387, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": { + "stars": "4 or 5 stars" + }, + "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "MH", + "Misba009", + "Amanda", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "MH", + "Misba009", + "Amanda", + "Amazon Customer", + "Cally", + "Bethany Robertson" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "MH, Misba009, Amanda, Amazon Customer, Cally, Bethany Robertson, Bethany Robertson" + }, + "intent_template_id": 1356, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } 
+ ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 388, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": { + "stars": "1 or 2 stars" + }, + "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": false, + "type": "text" + }, + "expected_data": [ + "Evelyn Kurver", + "N Randall" + ] + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Evelyn Kurver", + "N Randall" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Evelyn Kurver, N Randall" + }, + "intent_template_id": 1356, + "changelogs": [ + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 389, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "octovisuals page", + "repo": "primer/design", + "content": "Thanks, working on reviews" + }, + "intent": "Post \"Thanks, working on reviews\" for the merge request related to octovisuals page in primer/design project", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "primer", + "project": "design", + "merge_request_iid": 450, + "comment": "Thanks, working on reviews" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/primer/design/-/merge_requests/450", + "locator": "", + "required_contents": { + "must_include": [ + "Thanks, working on reviews" + ] + } + } + ] + }, + "intent_template_id": 348, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 390, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "semantic HTML post", + "repo": "a11yproject/a11yproject.com", + "content": "lgtm" + }, + "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1531, + "comment": "lgtm" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1531", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "lgtm" + } + } + ] + }, + "intent_template_id": 348, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API 
call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 391, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "focus edge cases", + "repo": "a11yproject/a11yproject.com", + "content": "close because non reproducible" + }, + "intent": "Post \"close because non reproducible\" for the merge request related to focus edge cases in a11yproject/a11yproject.com project", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1265, + "comment": "close because non reproducible" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1265", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "close because non reproducible" + } + } + ] + }, + "intent_template_id": 348, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 392, + "require_login": true, + "storage_state": 
"./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "color ulitity", + "repo": "a11yproject.com", + "content": "Good idea" + }, + "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1071, + "comment": "Good idea" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1071", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "Good idea" + } + } + ] + }, + "intent_template_id": 348, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 393, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": { + "mr": "fixing the broken links", + "repo": "byteblaze/empathy-prompts", + "content": "lgtm" + }, + "intent": "Post \"lgtm\" for the merge request related to fixing the broken links in byteblaze/empathy-prompts 
project", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "byteblaze", + "project": "empathy-prompts", + "merge_request_iid": 19, + "comment": "lgtm" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/empathy-prompts/-/merge_requests/19", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "lgtm" + } + } + ] + }, + "intent_template_id": 348, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 394, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "2019-nCov" + }, + "intent": "Fork 2019-nCov.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "2019-nCov" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/2019-nCov", + "locator": "", + "required_contents": { + "must_include": [ + "2019-nCov" + ] + } + } + ] + }, + "intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; 
more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 395, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "the Pytorch GAN repo with most stars" + }, + "intent": "Fork the Pytorch GAN repo with most stars.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "Pytorch-GAN" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/PyTorch-GAN", + "locator": "", + "required_contents": { + "must_include": [ + "Pytorch-GAN" + ] + } + } + ] + }, + "intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 396, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "ChatGPT" + }, + "intent": "Fork ChatGPT.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "ChatGPT" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__GITLAB__/byteblaze/ChatGPT", + "locator": "", + "required_contents": { + "must_include": [ + "ChatGPT" + ] + } + } + ] + }, + "intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 397, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "MetaSeq" + }, + "intent": "Fork MetaSeq.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "metaseq" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/metaseq", + "locator": "", + "required_contents": { + "must_include": [ + "metaseq" + ] + } + } + ] + }, + "intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 398, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "all source repos from Akilesh Kannan" + }, + "intent": "Fork all source repos from Akilesh Kannan.", + "original.intent": "Fork all source repos from Akilesh Kannan", + 
"require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": [ + "SimCache", + "dots", + "CacheEval", + "nvidia-patch", + "viewgrades-scraper" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/SimCache", + "locator": "", + "required_contents": { + "must_include": [ + "SimCache" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/dots", + "locator": "", + "required_contents": { + "must_include": [ + "dots" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/CacheEval", + "locator": "", + "required_contents": { + "must_include": [ + "CacheEval" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/nvidia-patch", + "locator": "", + "required_contents": { + "must_include": [ + "404" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/viewgrades-scraper", + "locator": "", + "required_contents": { + "must_include": [ + "404" + ] + } + } + ] + }, + "intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 399, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "I am a robot" + }, + "intent": "Change my reddit bio to \"I am a robot\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_bio_content", + "expected_data": { + "bio": "I am a robot" + } + } + } + ], + "site": "reddit" + }, + 
"original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "I am a robot" + } + } + ] + }, + "intent_template_id": 6, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 400, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Pro Python Developer with 20 years of Experience" + }, + "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_bio_content", + "expected_data": { + "bio": "Pro Python Developer with 20 years of Experience" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Pro Python Developer with 20 years of Experience" + } + } + ] + }, + "intent_template_id": 6, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": 
"Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 401, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Seeking SDE positions" + }, + "intent": "Change my reddit bio to \"Seeking SDE positions\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_bio_content", + "expected_data": { + "bio": "Seeking SDE positions" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Seeking SDE positions" + } + } + ] + }, + "intent_template_id": 6, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 402, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Freelance Web Developer" + }, + "intent": "Change my reddit bio to \"Freelance Web Developer\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_bio_content", + "expected_data": { + "bio": "Freelance Web Developer" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Freelance Web Developer" + } + } + ] + }, + "intent_template_id": 6, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 403, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": { + "content": "Awesome Prompt Artist" + }, + "intent": "Change my reddit bio to \"Awesome Prompt Artist\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_bio_content", + "expected_data": { + "bio": "Awesome Prompt Artist" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/user/MarvelsGrantMan136", + "locator": "document.querySelector(\".user-bio__biography\").outerText", + "required_contents": { + "exact_match": "Awesome Prompt Artist" + } + } + ] + }, + "intent_template_id": 6, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 
404, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Upvote the newest post in books subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "subreddit": "books" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 405, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "DIY" + }, + "intent": "Upvote the newest post in DIY subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "subreddit": "diy" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "__REDDIT__/f/DIY/119019/how-can-i-bring-an-hdmi-cable-from-my-pc-downstairs-to-my-tv", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 406, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "future technology" + }, + "intent": "Upvote the newest post in future technology subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "subreddit": "future technology" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Futurology/119517/openai-ceo-it-s-not-funny-that-i-m-afraid-of-the-ai-we-re", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": 
"Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 407, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "deeplearning" + }, + "intent": "Upvote the newest post in deeplearning subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "subreddit": "deeplearning" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/deeplearning/124993/meta-s-llama-weights-leaked-on-torrent-and-the-best-thing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 408, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": { + "subreddit": "explain like im 5" + }, + "intent": "Upvote the newest post in explain like im 5 subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + 
"expected_data": { + "subreddit": "explain like im 5" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/explainlikeimfive/39244/eli5-how-does-pinching-a-ribbon-and-sliding-your-finger", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 22, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 409, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "geolocation": null, + "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the manager of the website", + "content_description": "thanks! I am a big fan of your website." + }, + "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "post_id": "1235250", + "comment_content": "thanks! I am a big fan of your website." 
+ } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/books/59421/-/comment/1235250", + "locator": "", + "required_contents": { + "must_include": [ + "thanks! I am a big fan of your website." + ] + } + } + ] + }, + "intent_template_id": 23, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 410, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself", + "geolocation": null, + "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the first reply", + "content_description": "don't panic" + }, + "intent": "Reply to the first reply in this post with \"don't panic\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_newest_post", + "expected_data": { + "post_id": "1042264", + "comment_content": "don't panic" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/singularity/69404/-/comment/1042264", + "locator": "", + "required_contents": { + "must_include": [ + "don't panic" + ] + } + } + ] + }, + "intent_template_id": 23, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": 
"expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 411, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "byteblaze/cloud-to-butt" + }, + "intent": "Make the LICENSE of byteblaze/cloud-to-butt to MIT license.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "cloud-to-butt", + "file_path": "LICENSE.txt", + "substring": "MIT License", + "branch": "master" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE.txt", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license" + ] + } + } + ] + }, + "intent_template_id": 355, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use GitLab API to fetch file content; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 412, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to Apache License", + "instantiation_dict": { + "repo": "byteblaze/accessible-html-content-patterns" + }, + "intent": "Make the LICENSE of byteblaze/accessible-html-content-patterns to Apache License", + "require_reset": false, + "eval": { + 
"expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "accessible-html-content-patterns", + "file_path": "LICENSE", + "substring": "Apache License", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "Apache License", + "http://www.apache.org/licenses/LICENSE-2.0" + ] + } + } + ] + }, + "intent_template_id": 355, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use GitLab API to fetch file content; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 413, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to one that mandates all copies and derivative works to be under the same license", + "instantiation_dict": { + "repo": "byteblaze/a11y-syntax-highlighting" + }, + "intent": "Make the LICENSE of byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "a11y-syntax-highlighting", + "file_path": "LICENSE", + "substring": "GNU GENERAL PUBLIC LICENSE", + "not_substring": "MIT License", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + 
"reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "GENERAL PUBLIC LICENSE" + ] + } + } + ] + }, + "intent_template_id": 355, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use GitLab API to fetch file content; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 414, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "byteblaze/dotfiles" + }, + "intent": "Make the LICENSE of byteblaze/dotfiles to MIT license.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "dotfiles", + "file_path": "LICENSE", + "substring": "MIT License", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
+ ] + } + } + ] + }, + "intent_template_id": 355, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use GitLab API to fetch file content; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 415, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "verification functions" + }, + "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "byteblaze", + "project": "a11y-webring.club", + "merge_request_iid": 40, + "comment": "@davepgreene" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/merge_requests/40", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "@davepgreene" + } + } + ] + }, + "intent_template_id": 360, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 416, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "wcag" + }, + "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1270, + "comment": "Thank you" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1270", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "Thank you" + } + } + ] + }, + "intent_template_id": 360, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 417, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "instantiation_dict": { + "topic": "404 link" + }, + "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_merge_request_have_comment", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1485, + "comment": "@Roshanjossey" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1485", + "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", + "required_contents": { + "exact_match": "@Roshanjossey" + } + } + ] + }, + "intent_template_id": 360, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API call for MR comment verification; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 418, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Busy" + }, + "intent": "Set my gitlab status as Busy.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_user_status_message", + "eval_params": { + "user_id": "byteblaze", + "compare_str": "Busy" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Busy" + } + } + 
] + }, + "intent_template_id": 361, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 419, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Enjoying life" + }, + "intent": "Set my gitlab status as Enjoying life.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_user_status_message", + "eval_params": { + "user_id": "byteblaze", + "compare_str": "Enjoying life" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Enjoying life" + } + } + ] + }, + "intent_template_id": 361, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 420, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Playing Badminton" + }, + "intent": "Set my gitlab status as Playing Badminton.", + "require_reset": false, + "eval": { + 
"expected_backend_state": [ + { + "eval_func": { + "name": "check_user_status_message", + "eval_params": { + "user_id": "byteblaze", + "compare_str": "Playing Badminton" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Playing Badminton" + } + } + ] + }, + "intent_template_id": 361, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 421, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Resting due to leg injury" + }, + "intent": "Set my gitlab status as Resting due to leg injury.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_user_status_message", + "eval_params": { + "user_id": "byteblaze", + "compare_str": "Resting due to leg injury" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Resting due to leg injury" + } + } + ] + }, + "intent_template_id": 361, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more 
accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 422, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": { + "status": "Out of Office" + }, + "intent": "Set my gitlab status as Out of Office.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_user_status_message", + "eval_params": { + "user_id": "byteblaze", + "compare_str": "Out of Office" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.cover-status').lastChild.textContent", + "required_contents": { + "exact_match": "Out of Office" + } + } + ] + }, + "intent_template_id": 361, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 423, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{brand}} shirts on sale", + "instantiation_dict": { + "brand": "Hollister" + }, + "intent": "Mark all Hollister shirts on sale", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "126" + }, + "expected_data": { + "on_sale": 
true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/126/", + "locator": "document.querySelector('input[name=\"product[sale]\"]').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 237, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 424, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the place where Mr. Rogers was filmed" + }, + "intent": "Find the page of the place where Mr. Rogers was filmed on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 425, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the longest bridge in the Western hemisphere" + }, + "intent": "Find the page of the longest bridge in the Western hemisphere on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Mackinac Bridge" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 426, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" + }, + "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Somerset County" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 427, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the university that has most Turning Award winners" + }, + "intent": "Find the page of the university that has most Turning Award winners on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Massachusetts Institute of Technology" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + 
"sites": [ + "wikipedia", + "map" + ], + "task_id": 428, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the undergrad college of the person who developed the Nash equilibrium" + }, + "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Carnegie Mellon University" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 429, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": "the colleges where The Chair was filmed in Pittsburgh" + }, + "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Chatham University" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 430, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": { + "description": 
"the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" + }, + "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", + "required_contents": { + "must_include": [ + "Washington & Jefferson College" + ] + } + } + ] + }, + "intent_template_id": 371 + }, + { + "sites": [ + "shopping" + ], + "task_id": 431, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html |AND| __SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html |AND| __SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "SPAAS White Taper 
Candles - 4 Pack |OR| 10 Inch Tall Candles, Scent-Free Premium Wax Candle Sticks |OR| 8 Hour Long Burning White Candlesticks for Home Decoration, Wedding, Holiday and Parties" + ] + } + } + ] + }, + "intent_template_id": 145 + }, + { + "sites": [ + "shopping" + ], + "task_id": 432, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html |AND| __SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_shopping_cart", + "expected_data": { + "sku": "B00CPTR7WS" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24" + ] + } + } + ] + }, + "intent_template_id": 145, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 433, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html |AND| 
__SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_shopping_cart", + "expected_data": { + "sku": "B07Q1NRQBW" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "Tazrigo 5pcs White Dental Resin Brush Pens Dental Shaping Silicone Tooth Tool" + ] + } + } + ] + }, + "intent_template_id": 145, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 434, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html |AND| __SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_shopping_cart", + 
"expected_data": { + "sku": "B08MFJFHQ4" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "VIVIKI Women Glitter Socks Ultrathin Transparent Tulle Lace Socks - No Show Ankle Crew Socks 3 Pack" + ] + } + } + ] + }, + "intent_template_id": 145, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 435, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html |AND| __SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html", + "geolocation": null, + "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "instantiation_dict": {}, + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_shopping_cart", + "expected_data": { + "sku": "B01JMA0YX6" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/checkout/cart", + "locator": "", + "required_contents": { + "must_include": [ + "DP to HDMI Cable 6FT (2 Pack), Fosmon Gold Plated Displayport to HDMI Cable 1080p Full HD for PCs to HDTV, Monitor, Projector with HDMI Port" + ] + } + } + ] + 
}, + "intent_template_id": 145, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 436, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a mattress foundation", + "time": "around Feb or March 2023" + }, + "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", + "original.intent": "I previously ordered some a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B07DFJ5XKH" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07DFJ5XKH" + ] + } + } + ] + }, + "intent_template_id": 156, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix wording" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 437, + "require_login": true, + "storage_state": 
"./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a table lamp", + "time": "in May 2023" + }, + "intent": "I previously ordered a table lamp in May 2023 and later cancelled. Can you reorder it for me?", + "original.intent": "I previously ordered some a table lamp in May 2023 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B087QSCXGT" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B072XS3F6W" + ] + } + } + ] + }, + "intent_template_id": 156, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix wording" + }, + { + "key": "expected_backend_state", + "category": "reference_alignment", + "note": "Original sku is for a lamp ordered in February not May. Updated to the lamp ordered in May" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 438, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", + "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a TV stand", + "time": "sometime around sep 2022" + }, + "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?", + "original.intent": "I previously ordered some a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B017SK5E3M" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B08PVHRRB7" + ] + } + } + ] + }, + "intent_template_id": 156, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix wording" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 439, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a cat t-shirt", + "time": "during 2022" + }, + "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. 
Can you reorder it for me?", + "original.intent": "I previously ordered some a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B0844BWS76" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B0844BWS76" + ] + } + } + ] + }, + "intent_template_id": 156, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix wording" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 440, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": { + "product": "a make up removal kit", + "time": "during summer 2022" + }, + "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?", + "original.intent": "I previously ordered some a make up removal kit during summer 2022 and later cancelled. 
Can you reorder it for me?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B0738JQG6Q" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B0738JQG6Q" + ] + } + } + ] + }, + "intent_template_id": 156, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix wording" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 441, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "GIVE ME SPACE" + }, + "intent": "Update the project site's title to \"GIVE ME SPACE\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "GIVE ME SPACE", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "GIVE ME SPACE" + ] + } + } + ] + }, + "intent_template_id": 308, + "changelogs": [ + { + "key": 
"expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 442, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Welcome to my site" + }, + "intent": "Update the project site's title to \"Welcome to my site\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Welcome to my site", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Welcome to my site" + ] + } + } + ] + }, + "intent_template_id": 308, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 443, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Not an interesting site" + 
}, + "intent": "Update the project site's title to \"Not an interesting site\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Not an interesting site", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Not an interesting site" + ] + } + } + ] + }, + "intent_template_id": 308, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 444, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Title Wanted" + }, + "intent": "Update the project site's title to \"Title Wanted\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Title Wanted", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + 
"locator": "", + "required_contents": { + "must_include": [ + "Title Wanted" + ] + } + } + ] + }, + "intent_template_id": 308, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 445, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/gimmiethat.space", + "geolocation": null, + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": { + "title": "Hello" + }, + "intent": "Update the project site's title to \"Hello\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Hello", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", + "locator": "", + "required_contents": { + "must_include": [ + "Hello" + ] + } + } + ] + }, + "intent_template_id": 308, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 446, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} 
in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11yproject", + "issue": 404, + "account": "Roshanjossey" + }, + "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "issue_assignee_is", + "eval_params": { + "usernames": [ + "Roshanjossey" + ], + "group": "a11yproject", + "project": "a11yproject.com", + "issue_iid": 1478 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Roshanjossey", + "locator": "", + "required_contents": { + "must_include": [ + "404s, bad host, timeouts, bad urls for URLs linked from website" + ] + } + } + ] + }, + "intent_template_id": 999, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 447, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11y-webring.club", + "issue": "linking to an accessibility statement", + "account": "Rohan" + }, + "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "issue_assignee_is", + "eval_params": { + "usernames": [ + "Seirdy" + ], + "group": "byteblaze", + "project": "a11y-webring.club", + "issue_iid": 71 + } + } + } + ], + 
"site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Seirdy", + "locator": "", + "required_contents": { + "must_include": [ + "linking to an accessibility statement" + ] + } + } + ] + }, + "intent_template_id": 999, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 448, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "https://egg.tart.com" + }, + "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_website_url", + "eval_params": { + "username": "byteblaze", + "website_url": "https://egg.tart.com" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "egg.tart.com" + } + } + ] + }, + "intent_template_id": 331, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state 
check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 449, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "https://helloworld.xyz" + }, + "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_website_url", + "eval_params": { + "username": "byteblaze", + "website_url": "https://helloworld.xyz" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "helloworld.xyz" + } + } + ] + }, + "intent_template_id": 331, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 450, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "a11yproject.contributor.me" + }, + "intent": "set the homepage URL on my GitLab profile to a11yproject.contributor.me", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "a11yproject.contributor.me" + } + } + ] + }, + "intent_template_id": 331, + "changelogs": [ + { + "key": "expected_data", + "category": "unachievable_tasks", + "note": "Website URL invalid in GitLab; cannot fulfill" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 451, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "www.byteblaze.com" + }, + "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "www.byteblaze.com" + } + } + ] + }, + "intent_template_id": 331, + "changelogs": [ + { + "key": "expected_data", + "category": "unachievable_tasks", + "note": "Website URL invalid in GitLab; cannot fulfill" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 452, + "require_login": 
true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": { + "url": "byteblaze.github.io" + }, + "intent": "set the homepage URL on my GitLab profile to byteblaze.github.io", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze", + "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", + "required_contents": { + "exact_match": "byteblaze.github.io" + } + } + ] + }, + "intent_template_id": 331, + "changelogs": [ + { + "key": "expected_data", + "category": "unachievable_tasks", + "note": "Website URL not valid in GitLab" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 453, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Teton pullover hoodie" + }, + "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Teton pullover hoodie" + }, + "expected_data": { + "is_disabled": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], 
+ "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/78/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 454, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Ryker Tee Crew Neck" + }, + "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Ryker LumaTech™ Tee (Crew-neck)" + }, + "expected_data": { + "is_disabled": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/478/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 455, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable 
{{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Helios Endurance Tank" + }, + "original.instantiation_dict": { + "product": "lHelios Endurance Tank" + }, + "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.", + "original.intent": "Disable lHelios Endurance Tank from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Helios Endurance Tank" + }, + "expected_data": { + "is_disabled": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/676/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "reference_alignment", + "note": "Product name misspelled causing failure to find product" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 456, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Cora Pant" + }, + "intent": "Disable Cora Pant from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Cora Parachute Pant" + 
}, + "expected_data": { + "is_disabled": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1840/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 457, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": { + "product": "Karmen yoga pants" + }, + "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Karmen Yoga Pant" + }, + "expected_data": { + "is_disabled": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1819/", + "locator": "document.querySelector('[name=\"product[status]\"').value", + "required_contents": { + "exact_match": "2" + } + } + ] + }, + "intent_template_id": 242, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 458, + "require_login": true, + 
"storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "$5", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by $5", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1481" + }, + "expected_data": { + "price": "27.00" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "27.00" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 459, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "10%", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by 10%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "237" + }, + "expected_data": { + "price": "62.10" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/237/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "62.10" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 460, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Reduce" + }, + "intent": "Reduce the price of this product by 15%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "418" + }, + "expected_data": { + "price": "38.25" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "38.25" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 461, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "$11.5", + "action": 
"Increase" + }, + "intent": "Increase the price of this product by $11.5", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "721" + }, + "expected_data": { + "price": "29.50" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "29.50" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 462, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "10%", + "action": "Increase" + }, + "intent": "Increase the price of this product by 10%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "710" + }, + "expected_data": { + "price": "19.80" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "19.80" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + 
"category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 463, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", + "geolocation": null, + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Increase" + }, + "intent": "Increase the price of this product by 15%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "996" + }, + "expected_data": { + "price": "36.80" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "36.80" + } + } + ] + }, + "intent_template_id": 247, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 464, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "original.intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Antonia Racer Tank" + }, + "intent": "Update the product description of Antonia Racer Tank with the titles of all reviews with 
4 stars or above.", + "original.intent": "Update the product description of Antonia Racer Tank to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1796" + }, + "expected_data": { + "description": [ + "A regular or me" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "This is in regular rotation at the gym", + "Its colorful and looks kinda cute under my exercise tanks", + "it's very stylish for yoga or something else low impact" + ] + } + } + ] + }, + "intent_template_id": 251, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify the intent to specify exactly what to update the value with" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 465, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" + }, + "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": 
"validate_in_wishlist", + "expected_data": { + "sku": "B074QVN413" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" + ] + } + } + ] + }, + "intent_template_id": 186, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 466, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + }, + "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B01M1RMOLX" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" + ] + } + } + ] + }, + "intent_template_id": 186, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + 
"task_id": 467, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" + }, + "intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B09STCV25D" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" + ] + } + } + ] + }, + "intent_template_id": 186, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 468, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" + }, + "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list", + "require_reset": false, + "eval": { + "expected_backend_state": [ + 
{ + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B09QZRWT97" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" + ] + } + } + ] + }, + "intent_template_id": 186, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 469, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": { + "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" + }, + "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B09QXM7B42" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Light Blue Simple Summer New Low Heels Slippers 
for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" + ] + } + } + ] + }, + "intent_template_id": 186, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 470, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "302" + }, + "intent": "Cancel order 302", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "302" + }, + "expected_data": { + "status": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 471, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "307" + }, + "intent": "Cancel order 307", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "307" + }, + "expected_data": { + 
"status": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 472, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "299" + }, + "intent": "Cancel order 299", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "299" + }, + "expected_data": { + "status": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 473, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + 
"instantiation_dict": { + "id": "301" + }, + "intent": "Cancel order 301", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "301" + }, + "expected_data": { + "status": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 474, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Cancel order {{id}}", + "instantiation_dict": { + "id": "305" + }, + "intent": "Cancel order 305", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "305" + }, + "expected_data": { + "status": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/305/", + "locator": "document.querySelector(\"#order_status\").outerText", + "required_contents": { + "exact_match": "Canceled" + } + } + ] + }, + "intent_template_id": 257, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } 
+ ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 475, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "chatgpt_plugin" + }, + "intent": "Set up a new, empty repository with the name chatgpt_plugin?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "chatgpt_plugin" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/chatgpt_plugin", + "locator": "", + "required_contents": { + "must_include": [ + "chatgpt_plugin" + ] + } + } + ] + }, + "intent_template_id": 292, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 476, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_llm_reading" + }, + "intent": "Set up a new, empty repository with the name awesome_llm_reading?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "awesome_llm_reading" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + 
"program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_llm_reading", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_llm_reading" + ] + } + } + ] + }, + "intent_template_id": 292, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 477, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_program_aided_reasoning" + }, + "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "awesome_program_aided_reasoning" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_program_aided_reasoning", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_program_aided_reasoning" + ] + } + } + ] + }, + "intent_template_id": 292, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 478, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + 
"geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "webagent" + }, + "intent": "Set up a new, empty repository with the name webagent?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "webagent" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/webagent", + "locator": "", + "required_contents": { + "must_include": [ + "webagent" + ] + } + } + ] + }, + "intent_template_id": 292, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 479, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": { + "project_name": "awesome_webagent" + }, + "intent": "Set up a new, empty repository with the name awesome_webagent?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": "awesome_webagent" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_webagent", + "locator": "", + "required_contents": { + "must_include": [ + "awesome_webagent" + ] + } + } + ] + }, + "intent_template_id": 292, + 
"changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 480, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}}", + "instantiation_dict": { + "collaborator_account_list": "yjlou", + "repo": "solarized-prism-theme" + }, + "intent": "Invite yjlou as collaborator to solarized-prism-theme", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "yjlou" + ], + "group": "byteblaze", + "project": "solarized-prism-theme" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "yjlou" + ] + } + } + ] + }, + "intent_template_id": 293, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 481, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Abishek" + }, + "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "abisubramanya27" + ], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 482, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "yjlou" + }, + "intent": "yjlou wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "yjlou" + ], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 483, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Koushik" + }, + "intent": "Koushik wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "koush" + ], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'koush')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 484, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Jakub Klinkovsk\u00fd" + }, + "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "lahwaacz" + ], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 485, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": { + "name": "Vinta" + }, + "intent": "Vinta wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "vinta" + ], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'vinta')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 294, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 486, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "404 Not Found", + "heading": "Bruh bro you clicked the wrong page" + }, + "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_site_page_details", + "eval_params": { + "page_id": "1" + }, + "expected_data": { + "title": "Bruh bro you clicked the wrong page" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/cms/page/edit/page_id/1/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Bruh bro you clicked the wrong page" + } + } + ] + }, + "intent_template_id": 275, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 487, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Enable Cookies", + "heading": "Cookie monster coming to your place" + }, + "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_site_page_details", + "eval_params": { + "page_id": "3" + }, + "expected_data": { + "title": "Cookie monster coming to your place" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/3/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Cookie monster coming to your place" + } + } + ] + }, + "intent_template_id": 275, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 488, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + 
"geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Home Page", + "heading": "This is the home page!! Leave here!!" + }, + "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_site_page_details", + "eval_params": { + "page_id": "2" + }, + "expected_data": { + "title": "This is the home page!! Leave here!!" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/2/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "This is the home page!! Leave here!!" + } + } + ] + }, + "intent_template_id": 275, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 489, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "Privacy Policy", + "heading": "No privacy policy is needed in this dystopian world" + }, + "original.instantiation_dict": { + "old-heading": "Privacy Policy", + "heading": "No privacy policy is needed is this dystopian world" + }, + "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", + "original.intent": "Change the page title of \"Privacy Policy\" page on 
my site to \"No privacy policy is needed is this dystopian world\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_site_page_details", + "eval_params": { + "page_id": "4" + }, + "expected_data": { + "title": "No privacy policy is needed in this dystopian world" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/4/", + "locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "No privacy policy is needed is this dystopian world" + } + } + ] + }, + "intent_template_id": 275, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Update grammar for heading" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 490, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": { + "old-heading": "About us", + "heading": "Secret" + }, + "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_site_page_details", + "eval_params": { + "page_id": "5" + }, + "expected_data": { + "title": "Secret" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/5/", + 
"locator": "document.querySelector('input[name=\"title\"').value", + "required_contents": { + "exact_match": "Secret" + } + } + ] + }, + "intent_template_id": 275, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 491, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Sarah Miller", + "message": "the order is ready to be shipped soon!" + }, + "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "System message: We cannot add order history." 
+ }, + "intent_template_id": 280, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 492, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Jane Doe", + "message": "sorry we are out of stock, please reorder" + }, + "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "302" + }, + "expected_data": { + "comment": "sorry we are out of stock, please reorder" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "sorry we are out of stock, please reorder" + } + } + ] + }, + "intent_template_id": 280, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 493, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message 
\"{{message}}\"", + "instantiation_dict": { + "name": "Grace Nguyen", + "message": "sorry we are bankrupt, please contact our customer service for refund" + }, + "intent": "Notify Grace Nguyen in their most recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "307" + }, + "expected_data": { + "comment": "sorry we are bankrupt, please contact our customer service for refund" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "sorry we are bankrupt, please contact our customer service for refund" + } + } + ] + }, + "intent_template_id": 280, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 494, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Alex Thomas", + "message": "Yo, your order will be shipped soon!" 
+ }, + "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "304" + }, + "expected_data": { + "comment": "Yo, your order will be shipped soon!" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/304/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "Yo, your order will be shipped soon!" + } + } + ] + }, + "intent_template_id": 280, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 495, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": { + "name": "Lily Potter", + "message": "Thanks, your order is ready to be shipped!" + }, + "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "303" + }, + "expected_data": { + "comment": "Thanks, your order is ready to be shipped!" 
+ } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/303/", + "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", + "required_contents": { + "exact_match": "Thanks, your order is ready to be shipped!" + } + } + ] + }, + "intent_template_id": 280, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 496, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "8974568499", + "order": "299", + "service": "Federal Express" + }, + "intent": "Update order #299 with the Federal Express tracking number 8974568499", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "299" + }, + "expected_data": { + "shipping_carrier": "Federal Express", + "tracking_number": "8974568499" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/299/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 8974568499 for Federal Express assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "changelogs": [ + { + "key": "expected_backend_state", + 
"category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 497, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "24353446464", + "order": "307", + "service": "DHL" + }, + "intent": "Update order #307 with the DHL tracking number 24353446464", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "307" + }, + "expected_data": { + "shipping_carrier": "DHL", + "tracking_number": "24353446464" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/307/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 24353446464 for DHL assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 498, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "55591023930", + "order": "306", + "service": "UPS" + }, + "intent": "Update order #306 with the UPS tracking number 55591023930", + "require_reset": false, + "eval": { + 
"expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "306" + }, + "expected_data": { + "shipping_carrier": "UPS", + "tracking_number": "55591023930" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/306/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 55591023930 for United Parcel Service assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 499, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "13849373987", + "order": "304", + "service": "USPS" + }, + "intent": "Update order #304 with the USPS tracking number 13849373987", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "304" + }, + "expected_data": { + "shipping_carrier": "USPS", + "tracking_number": "13849373987" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/304/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 13849373987 for United States Postal Service 
assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 500, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": { + "tracking": "239028439840", + "order": "301", + "service": "DHL" + }, + "intent": "Update order #301 with the DHL tracking number 239028439840", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "301" + }, + "expected_data": { + "shipping_carrier": "DHL", + "tracking_number": "239028439840" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/301/active_tab/order_shipments/", + "locator": "", + "required_contents": { + "must_include": [ + "Tracking number 239028439840 for DHL assigned" + ] + } + } + ] + }, + "intent_template_id": 284, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 501, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{product}} as out of stock", + "original.intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Taurus Elements Shell" + }, + 
"intent": "Mark all Taurus Elements Shell as out of stock", + "original.intent": "Make all Taurus Elements Shell as out of stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "350" + }, + "expected_data": { + "in_stock": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/350/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use 'mark' instead of 'make'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 502, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{product}} as out of stock", + "original.intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Gobi HeatTec Tee" + }, + "intent": "Mark all Gobi HeatTec Tee as out of stock", + "original.intent": "Make all Gobi HeatTec Tee as out of stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "446" + }, + "expected_data": { + "in_stock": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + 
"url": "__SHOPPING_ADMIN__/catalog/product/edit/id/446/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use 'mark' instead of 'make'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 503, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{product}} as out of stock", + "original.intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "rocco gym tank" + }, + "intent": "Mark all rocco gym tank as out of stock", + "original.intent": "Make all rocco gym tank as out of stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "682" + }, + "expected_data": { + "in_stock": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/682/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use 'mark' instead of 'make'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" 
+ } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 504, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{product}} as out of stock", + "original.intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Selene yoga hoodie" + }, + "intent": "Mark all Selene yoga hoodie as out of stock", + "original.intent": "Make all Selene yoga hoodie as out of stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1108" + }, + "expected_data": { + "in_stock": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1108/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use 'mark' instead of 'make'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 505, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Mark all {{product}} as out of stock", + "original.intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": { + "product": "Aeon capri" + }, + "original.instantiation_dict": { + "product": "Aeno capri" + }, + "intent": "Mark all Aeon capri as out of stock", + 
"original.intent": "Make all Aeno capri as out of stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1861" + }, + "expected_data": { + "in_stock": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1861/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "0" + } + } + ] + }, + "intent_template_id": 287, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Use 'mark' instead of 'make'" + }, + { + "key": "instantiation_dict", + "category": "task_ambiguity", + "note": "Incorrect spelling of product name leading inability to find product. Changed 'Aeno' to 'Aeon'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 506, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "meat substitute", + "dollar_value": "between 100 and 200" + }, + "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B01CTR3DLE" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + 
"reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01CTR3DLE" + ] + } + } + ] + }, + "intent_template_id": 172, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 507, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Ceiling light", + "dollar_value": "above 1000" + }, + "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B07BVL3P1V" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07BVL3P1V" + ] + } + } + ] + }, + "intent_template_id": 172, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 508, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest 
rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "NS switch pouch", + "dollar_value": "under 60" + }, + "intent": "Buy the highest rated product from the NS switch pouch category within a budget under 60.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B07116LGP6" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B07116LGP6" + ] + } + } + ] + }, + "intent_template_id": 172, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 509, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "instantiation_dict": { + "category": "Men's shoe" + }, + "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B01J4MM3KO" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": 
"document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B01J4MM3KO" + ] + } + } + ] + }, + "intent_template_id": 216, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 510, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "instantiation_dict": { + "category": "Home Audio Speaker" + }, + "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_ordered", + "expected_data": { + "sku": "B002R5ABIW" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "func:shopping_get_latest_order_url()", + "locator": "document.querySelector(\".order-details-items.ordered\").outerText", + "required_contents": { + "must_include": [ + "B002R5ABIW" + ] + } + } + ] + }, + "intent_template_id": 216, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 511, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "laundry detergent" + }, + "intent": "Add a laundry detergent to 
my wish list.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "laundry", + "detergent" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 512, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "toothpaste" + }, + "intent": "Add a toothpaste to my wish list.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "toothpaste" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 513, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "chair" + }, + "intent": "Add a chair to my wish list.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "chair" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": 
[ + "shopping" + ], + "task_id": 514, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "white desk" + }, + "intent": "Add a white desk to my wish list.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "white", + "desk" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 515, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": { + "product": "white computer desk" + }, + "intent": "Add a white computer desk to my wish list.", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "white", + "computer", + "desk" + ] + } + } + ] + }, + "intent_template_id": 189 + }, + { + "sites": [ + "shopping" + ], + "task_id": 516, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + 
"eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B0040WHKIY" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch" + ] + } + } + ] + }, + "intent_template_id": 196, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 517, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B01MTYJG38" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Skinit Decal Gaming Skin Compatible with Xbox One S Console and Controller Bundle - Officially Licensed NFL Baltimore Ravens Design" + ] + } + } + ] + }, + "intent_template_id": 196, + "changelogs": [ + { + "key": "expected_backend_state", + "category": 
"dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 518, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B01MY87FWG" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Sceptre E195BD-SRR 19-Inch 720P LED TV, True Black (2017)" + ] + } + } + ] + }, + "intent_template_id": 196, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 519, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B09GG4P4MD" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "iPhone 13 Pro Max Case, Neon Turtle iPhone 13 Pro Max Cases, Tempered Glass Back+Soft Silicone TPU Shock Protective Case for Apple iPhone 13 Pro Max" + ] + } + } + ] + }, + "intent_template_id": 196, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 520, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html", + "geolocation": null, + "intent_template": "Add this product to my wishlist", + "instantiation_dict": {}, + "intent": "Add this product to my wishlist", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_in_wishlist", + "expected_data": { + "sku": "B073XDR3K6" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/wishlist/", + "locator": "document.querySelector('.products-grid.wishlist').outerText", + "required_contents": { + "must_include": [ + "Magnetic Metal Stainless Steel D-pads Kits Directional Pad Replacement Parts for Xbox One Elite Controller, Elite Series 2, Xbox One, Xbox One S/X Controller" + ] + } + } + ] + }, + "intent_template_id": 196, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + 
}, + { + "sites": [ + "shopping" + ], + "task_id": 521, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Subscribe to the newsletter of OneStopMarket", + "instantiation_dict": {}, + "intent": "Subscribe to the newsletter of OneStopMarket", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_newsletter", + "expected_data": null + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/newsletter/manage/", + "locator": "document.querySelector('[title=\"General Subscription\"').checked.toString()", + "required_contents": { + "exact_match": "true" + } + } + ] + }, + "intent_template_id": 199, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 522, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Fork {{repo}}.", + "instantiation_dict": { + "repo": "all repos from facebook" + }, + "intent": "Fork all repos from facebook.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_project", + "eval_params": { + "project": [ + "create-react-app", + "buck" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/projects", + "locator": "document.querySelector('[data-qa-selector=\"projects_list\"').outerText", + "required_contents": { + "must_include": [ + "create-react-app", + "buck" + ] + } + } + ] + }, + 
"intent_template_id": 352, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 523, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "five" + }, + "intent": "Star the top five most stared repos in Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_starred_project", + "eval_params": { + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync" + ] + } + } + ] + }, + "intent_template_id": 354, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 524, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + 
"instantiation_dict": { + "number": "eight" + }, + "intent": "Star the top eight most stared repos in Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_starred_project", + "eval_params": { + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync", + "Pytorch-GAN", + "administrate", + "keycloak" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter", + "AndroidAsync", + "Pytorch-GAN", + "administrate", + "keycloak" + ] + } + } + ] + }, + "intent_template_id": 354, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 525, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "four" + }, + "intent": "Star the top four most stared repos in Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_starred_project", + "eval_params": { + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + 
"program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python", + "PHP_XLSXWriter" + ] + } + } + ] + }, + "intent_template_id": 354, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 526, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "three" + }, + "intent": "Star the top three most stared repos in Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_starred_project", + "eval_params": { + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel", + "create-react-app", + "ffmpeg-python" + ] + } + } + ] + }, + "intent_template_id": 354, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": 
[ + "gitlab" + ], + "task_id": 527, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": { + "number": "one" + }, + "intent": "Star the top one most stared repos in Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "user_has_starred_project", + "eval_params": { + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/starred", + "locator": "", + "required_contents": { + "must_include": [ + "AndroidSlidingUpPanel" + ] + } + } + ] + }, + "intent_template_id": 354, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 528, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": { + "product": "phone screen protector", + "time": "March 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000180", + "12.99" + ] + } + } + ] + }, + "intent_template_id": 154, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 529, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": { + "product": "bluetooth speaker", + "time": "Feb 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000148", + "169.95" + ] + } + } + ] + }, + "intent_template_id": 154, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 530, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": { + "product": "kitchen organizer", + "time": "around Feb 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000161", + "68.88" + ] + } + } + ] + }, + "intent_template_id": 154, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 531, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": { + "product": "phone case", + "time": "March 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000180", + "$12.99" + ] + } + } + ] + }, + "intent_template_id": 154, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 532, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": { + "product": "PS3 remote controller", + "time": "early 2023" + }, + "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "original.intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "require_reset": false, + "eval": { + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000180", + "1.63" + ] + } + } + ] + }, + "intent_template_id": 154, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 533, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "convexegg", + "yjlou" + ] + }, + "intent": "Follow ['convexegg', 'yjlou'] on Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_following", + "eval_params": { + "username": "byteblaze", + "following_usernames": [ + "convexegg", + "yjlou" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + 
"reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@convexegg", + "@yjlou" + ] + } + } + ] + }, + "intent_template_id": 330, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API-based follower check; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 534, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub Klinkovsk\u00fd", + "Koushik", + "Vinta Chen" + ] + }, + "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_following", + "eval_params": { + "username": "byteblaze", + "following_usernames": [ + "lahwaacz", + "koush", + "vinta" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@koush", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 330, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API-based follower check; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, 
+ { + "sites": [ + "gitlab" + ], + "task_id": 535, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub K", + "ghost", + "Beno\u00eet Blanchon" + ] + }, + "intent": "Follow ['Jakub K', 'ghost', 'Beno\u00eet Blanchon'] on Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_following", + "eval_params": { + "username": "byteblaze", + "following_usernames": [ + "lahwaacz", + "ghost", + "bblanchon" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@ghost", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 330, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API-based follower check; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 536, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "ghost", + "R1kk3r", + "Abishek" + ] + }, + "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_following", + "eval_params": { + "username": "byteblaze", + "following_usernames": [ + "ghost", + "R1kk3r", 
+ "abisubramanya27" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@R1kk3r", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 330, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API-based follower check; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 537, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { + "account_list": [ + "Jakub Klinkovsk", + "convexegg", + "Vinta Chen", + "yjlou", + "Abishek S" + ] + }, + "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "is_user_following", + "eval_params": { + "username": "byteblaze", + "following_usernames": [ + "lahwaacz", + "convexegg", + "vinta", + "yjlou", + "abisubramanya27" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/users/byteblaze/following", + "locator": "document.querySelector('.user-profile').outerText", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@convexegg", + "@vinta", + "@yjlou", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 330, + "changelogs": [ + { + 
"key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API-based follower check; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 538, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "299", + "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" + }, + "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "299" + }, + "expected_data": { + "address": "456 Oak Avenue", + "address2": "Apartment 5B", + "city": "New York", + "state": "New York", + "zip_code": "10001", + "country": "US" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299", + "locator": "", + "required_contents": { + "must_include": [ + "456 Oak Avenue", + "Apartment 5B", + "New York", + "10001" + ] + } + } + ] + }, + "intent_template_id": 240, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 539, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to 
{{address}}", + "instantiation_dict": { + "order_id": "65", + "address": "789 Pine Lane, San Francisco, CA, 94102" + }, + "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "65" + }, + "expected_data": { + "address": "789 Pine Lane", + "city": "San Francisco", + "state": "California", + "zip_code": "94102" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/65", + "locator": "", + "required_contents": { + "must_include": [ + "789 Pine Lane", + "San Francisco", + "California", + "94102" + ] + } + } + ] + }, + "intent_template_id": 240, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 540, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "301", + "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201" + }, + "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "301" + }, + "expected_data": { + "address": "321 Birch Boulevard", + "address2": "Suite 200", + "city": "Dallas", + "state": "Texas", + "zip_code": "75201" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301", + "locator": "", + "required_contents": { + "must_include": [ + "321 Birch Boulevard", + "Suite 200", + "Dallas", + "Texas", + "75201" + ] + } + } + ] + }, + "intent_template_id": 240, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 541, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "125", + "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101" + }, + "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "125" + }, + "expected_data": { + "address": "654 Elm Drive", + "address2": "Apartment 12", + "city": "Miami", + "state": "Florida", + "zip_code": "33101" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125", + "locator": "", + "required_contents": { + "must_include": [ + "654 Elm Drive", + "Apartment 12", + "Miami", + "Florida", + "33101" + ] + } + } + ] + }, + "intent_template_id": 240, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + 
"task_id": 542, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": { + "order_id": "300", + "address": "987 Cedar Court, Los Angeles, CA, 90012" + }, + "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_order_details", + "eval_params": { + "order_id": "300" + }, + "expected_data": { + "address": "987 Cedar Court", + "city": "Los Angeles", + "state": "California", + "zip_code": "90012" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/300", + "locator": "", + "required_contents": { + "must_include": [ + "987 Cedar Court", + "Los Angeles", + "California", + "90012" + ] + } + } + ] + }, + "intent_template_id": 240, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 543, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "original.intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Bella Tank" + }, + "intent": "Update the product description of Bella Tank with the titles of all reviews with 4 stars or above.", + "original.intent": 
"Update the product description of Bella Tank to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1684" + }, + "expected_data": { + "description": [ + "A sweet n sporty look for the gym", + "Good choice for working out" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../bella-tank.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "Good choice for working out and stylin' enough to wear when I'm hanging with friends on hot days", + "Also washes really well", + "Always a sweet n sporty look for the gym", + "Keeps me cool and the seams don't rub up against me like some of my other tanks" + ] + } + } + ] + }, + "intent_template_id": 251, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify the intent to specify exactly what to update the value with" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 544, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + 
"product": "Selene Yoga Hoodie" + }, + "original.instantiation_dict": { + "product": "Selena Yoga Hoodie" + }, + "intent": "Update the product description of Selene Yoga Hoodie with the titles of all reviews with 4 stars or above.", + "original.intent": "Update the description of Selena Yoga Hoodie to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1108" + }, + "expected_data": { + "description": [ + "Best hoodies I've owned.", + "Great value", + "Kept me warm" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../selene-yoga-hoodie.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "I was super cold and it did the job.", + "The sleeves are definitely thicker than you realize, which is a good thing", + "really quite substantial", + "planning on buying another one of these in another color", + "the best hoodie ive ever owned" + ] + } + } + ] + }, + "intent_template_id": 251, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify the intent to specify exactly what to update the value with" + }, + { + "key": "instantiation_dict", + "category": "task_ambiguity", + "note": "Product name misspelled leading to inability to update the product description. 
Changed from 'Selena' to 'Selene'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 545, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Radiant Tee" + }, + "intent": "Update the product description of Radiant Tee with the titles of all reviews with 4 stars or above.", + "original.intent": "Update the description of Radiant Tee to highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1556" + }, + "expected_data": { + "description": [ + "What a versatile shirt!" 
+ ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../radiant-tee.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "What I rally love here is that it does the job of keeping me cool and dry", + "I'm a big guy and sweat A LOT", + "Even after a day of gulf, I'm still dry and comfortable", + "What a versatile shirt", + "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised", + "I like going out after my game for drinks so I look good then too and don't need to change into something fresh" + ] + } + } + ] + }, + "intent_template_id": 251, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify the intent to specify exactly what to update the value with" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 546, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", + "instantiation_dict": { + "product": "Lucia Cross-Fit Bra" + }, + "intent": "Update the product description of Lucia Cross-Fit Bra with the titles of all reviews with 4 stars or above.", + "original.intent": "Update the description of Lucia Cross-Fit Bra to 
highlight the real user positive reviews by quoting the comments", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/../affirm-water-bottle.html", + "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", + "required_contents": { + "must_include": [ + "Wide mouth opening makes it easy to clean" + ] + } + } + ] + }, + "intent_template_id": 251, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify the intent to specify exactly what to update the value with" + }, + { + "key": "expected_retrieve_value", + "category": "reference_alignment", + "note": "Original expected value include incorrect review" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 547, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} option {{value}} to the {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "color", + "value": "brown", + "base_setting": "size S", + "product": "Phoebe Zipper Sweatshirt" + }, + "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1130" + }, + "expected_data": { + "variants": [ + "Phoebe Zipper Sweatshirt-S-Brown" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + 
"eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1130/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Phoebe Zipper Sweatshirt-S-Brown" + ] + } + } + ] + }, + "intent_template_id": 252, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 548, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "color", + "value": "blue", + "base_setting": "size S and M", + "product": "Frankie Sweatshirt" + }, + "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "110" + }, + "expected_data": { + "variants": [ + "Frankie Sweatshirt-M-Blue", + "Frankie Sweatshirt-S-Blue" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/110/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Sweatshirt-M-Blue", + "Sweatshirt-S-Blue" + ] + } + } + ] + }, + "intent_template_id": 252, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + 
"sites": [ + "shopping_admin" + ], + "task_id": 549, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", + "instantiation_dict": { + "option": "size", + "value": "XXXL", + "base_setting": "green", + "product": "Minerva LumaTech\u2122 V-Tee" + }, + "original.instantiation_dict": { + "option": "size", + "value": "XXXL", + "base_setting": "green", + "product": "Minerva LumaTech V-Tee" + }, + "intent": "Add a new size XXXL to green Minerva LumaTech\u2122 V-Tee", + "original.intent": "Add a new size XXXL to green Minerva LumaTech V-Tee", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1492" + }, + "expected_data": { + "variants": [ + "Minerva LumaTech\u2122 V-Tee-XXXL-Green" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1492/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "V-Tee-XXXL-Green" + ] + } + } + ] + }, + "intent_template_id": 252, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "task_ambiguity", + "note": "Changed 'Minerva LumaTech V-Tee' to full product name 'Minerva LumaTech\u2122 V-Tee'" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 550, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a new 
{{option}} {{value}} to {{base_setting}} {{product}}", + "instantiation_dict": { + "option": "size", + "value": "XXS", + "base_setting": "blue and purple", + "product": "Nona Fitness Tank" + }, + "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1732" + }, + "expected_data": { + "variants": [ + "Nona Fitness Tank-XXS-Blue", + "Nona Fitness Tank-XXS-Purple" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1732/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Tank-XXS-Blue", + "Tank-XXS-Purple" + ] + } + } + ] + }, + "intent_template_id": 252, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 551, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add new {{option}} {{value}} to {{base_setting}} of {{product}}", + "instantiation_dict": { + "option": "size", + "value": "30 and 31", + "base_setting": "all color variants", + "product": "Diana Tights" + }, + "intent": "Add new size 30 and 31 to all color variants of Diana Tights", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1854" + }, + "expected_data": { + "variants": [ + "Diana Tights-30-Blue", + "Diana Tights-30-Black", + "Diana Tights-30-Orange", + "Diana 
Tights-31-Blue", + "Diana Tights-31-Black", + "Diana Tights-31-Orange" + ] + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1854/", + "locator": "document.querySelector('[data-index=\"configurable\"').outerText", + "required_contents": { + "must_include": [ + "Tights-30-Blue", + "Tights-30-Black", + "Tights-30-Orange", + "Tights-31-Blue", + "Tights-31-Black", + "Tights-31-Orange" + ] + } + } + ] + }, + "intent_template_id": 252, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 552, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "real_space", + "subreddit": "space", + "gitlab_repo": "gimmiethat.space" + }, + "intent": "Create a folder named real_space in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/real_space/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus", + "__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations", + "__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon", + "__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black", + "__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 553, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "news", + "gitlab_repo": "gimmiethat.space", + "subreddit": "news related subreddits" + }, + "intent": "Create a folder named news in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/news/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack", + "__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation", + "__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law", + "__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under", + "__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 554, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "moive_space", + "gitlab_repo": "gimmiethat.space", + "subreddit": "movies" + }, + "intent": "Create a folder named moive_space in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/moive_space/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make", + "__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in", + "__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of", + "__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for", + "__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 555, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "instantiation_dict": { + "directory": "funny_pic", + "gitlab_repo": "gimmiethat.space", + "subreddit": "memes" + }, + "intent": "Create a folder named funny_pic in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/funny_pic/urls.txt", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/memes/127991/it-do-be-like-that-tho", + "__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you", + "__REDDIT__/f/memes/127989/if-you-have-no-other-choice", + "__REDDIT__/f/memes/127988/yes-yes-yes", + "__REDDIT__/f/memes/127987/shagadelic-baby" + ] + } + } + ] + }, + "intent_template_id": 84 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 556, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_honest_fans", + "topics": "movies directed by Christopher Nolan" + }, + "intent": "Create a repo named nolan_honest_fans with movies directed by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_honest_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Following", + "Memento", + "Insomnia", + "Batman Begins", + "The Prestige", + "The Dark Knight", + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet", + "Oppenheimer" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 557, + "require_login": true, + "storage_state": 
"./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_old_fans", + "topics": "movies directed by Christopher Nolan before 2010" + }, + "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_old_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Following", + "Memento", + "Insomnia", + "Batman Begins", + "The Prestige", + "The Dark Knight" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 558, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_young_fans", + "topics": "movies directed by Christopher Nolan after 2010" + }, + "intent": "Create a repo named nolan_young_fans with movies directed by Christopher Nolan after 2010 in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_young_fans/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet", + "Oppenheimer" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 559, + "require_login": true, + 
"storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_followers", + "topics": "career timeline of Christopher Nolan" + }, + "intent": "Create a repo named nolan_followers with career timeline of Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_followers/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "1993\u20132003: Early career and breakthrough", + "2003\u20132013: Widespread recognition", + "2014\u20132019: Established Hollywood auteur", + "2020\u2013present" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "wikipedia" + ], + "task_id": 560, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "nolan_academy_awards", + "topics": "movies that won Academy Awards by Christopher Nolan" + }, + "intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/nolan_academy_awards/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "The Dark Knight", + "Inception", + "Interstellar", + "Dunkirk", + "Tenet" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + 
"wikipedia" + ], + "task_id": 561, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": { + "name": "bafta_awards_nolan", + "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" + }, + "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/bafta_awards_nolan/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "Batman Begins", + "The Dark Knight", + "Inception", + "The Dark Knight Rises", + "Interstellar", + "Dunkirk", + "Tenet" + ] + } + } + ] + }, + "intent_template_id": 87 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 562, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "Awesome_DIY_ideas", + "num": 6 + }, + "intent": "create a repository named Awesome_DIY_ideas that includes a README file with the links to the most active 6 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/Awesome_DIY_ideas/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + 
"__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 563, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "fun_thing_to_do", + "num": 5 + }, + "intent": "create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/fun_thing_to_do/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 564, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the 
links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "live_a_life", + "num": 3 + }, + "intent": "create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/live_a_life/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 565, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "TODO", + "num": 10 + }, + "intent": "create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/TODO/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + 
"__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question", + "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "__REDDIT__/f/DIY/118866/paver-base-for-shed", + "__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service", + "__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 566, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", + "instantiation_dict": { + "name": "Do it myself", + "num": 8 + }, + "intent": "create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit?", + "require_reset": false, + "eval": { + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/Do-it-myself/-/raw/main/README.md", + "locator": "", + "required_contents": { + "must_include": [ + "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", + "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "__REDDIT__/f/DIY/118931/afci-outlet-question", + "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "__REDDIT__/f/DIY/118866/paver-base-for-shed" + ] + } + } + ] + }, + "intent_template_id": 88 + }, + { + "sites": [ + 
"gitlab" + ], + "task_id": 567, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon", + "repo": "gimmiethat.space" + }, + "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator to gimmiethat.space repo", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "lahwaacz", + "bblanchon" + ], + "group": "byteblaze", + "project": "gimmiethat.space" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 293, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 568, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Abishek and Vinta", + "repo": "a11yproject.com" + }, + "intent": "Invite Abishek and Vinta as collaborator to a11yproject.com repo", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": 
{ + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "abisubramanya27", + "vinta" + ], + "group": "a11yproject", + "project": "a11yproject.com" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 293, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 569, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Beno\u00eet and Abishek", + "repo": "my HTML5 markup extention" + }, + "intent": "Invite Beno\u00eet and Abishek as collaborator to my HTML5 markup extention repo", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "bblanchon", + "abisubramanya27" + ], + "group": "byteblaze", + "project": "accessible-html-content-patterns" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@bblanchon", + 
"@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 293, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 570, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "instantiation_dict": { + "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon", + "repo": "my time tracking tool project" + }, + "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator to my time tracking tool project repo", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "lahwaacz", + "V13Axel", + "alexhutnik", + "bblanchon" + ], + "group": "byteblaze", + "project": "timeit" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/timeit/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@lahwaacz", + "@V13Axel", + "@alexhutnik", + "@bblanchon" + ] + } + } + ] + }, + "intent_template_id": 293, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 571, + 
"require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "231 Willow Way, Suite 100, Chicago, IL, 60601" + }, + "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_address", + "expected_data": { + "address": "231 Willow Way", + "address2": "Suite 100", + "city": "Chicago", + "state": "Illinois", + "zip_code": "60601" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "231 Willow Way", + "Suite 100", + "Chicago, Illinois, 60601" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "231 Willow Way", + "Suite 100", + "Chicago, Illinois, 60601" + ] + } + } + ] + }, + "intent_template_id": 165, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 572, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { 
+ "address": "654 Aspen Road, House #3, Boston, MA, 02110" + }, + "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_address", + "expected_data": { + "address": "654 Aspen Road", + "address2": "House #3", + "city": "Boston", + "state": "Massachusetts", + "zip_code": "02110" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "654 Aspen Road", + "House #3", + "Boston, Massachusetts, 02110" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "654 Aspen Road", + "House #3", + "Boston, Massachusetts, 02110" + ] + } + } + ] + }, + "intent_template_id": 165, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 573, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "987 Sycamore Circle, Philadelphia, PA, 19102" + }, + "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + 
"expected_backend_state": [ + { + "eval_func": { + "name": "validate_address", + "expected_data": { + "address": "987 Sycamore Circle", + "city": "Philadelphia", + "state": "Pennsylvania", + "zip_code": "19102" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "987 Sycamore Circle", + "Philadelphia, Pennsylvania, 19102" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "987 Sycamore Circle", + "Philadelphia, Pennsylvania, 19102" + ] + } + } + ] + }, + "intent_template_id": 165, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 574, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "111 Magnolia Path, Atlanta, GA, 30303" + }, + "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_address", + "expected_data": { + "address": "111 Magnolia Path", + "city": "Atlanta", + "state": "Georgia", + "zip_code": "30303" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + 
"reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "111 Magnolia Path", + "Atlanta, Georgia, 30303" + ] + } + } + ] + }, + "intent_template_id": 165, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 575, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": { + "address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101" + }, + "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_address", + "expected_data": { + "address": "222 Redwood Rise", + "address2": "Suite 300", + "city": "Seattle", + "state": "Washington", + "zip_code": "98101" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", + "required_contents": { + "must_include": [ + "222 Redwood Rise", + "Suite 300", 
+ "Seattle, Washington, 98101" + ] + } + }, + { + "url": "__SHOPPING__/customer/address", + "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", + "required_contents": { + "must_include": [ + "222 Redwood Rise", + "Suite 300", + "Seattle, Washington, 98101" + ] + } + } + ] + }, + "intent_template_id": 165, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 576, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "a11y-webring.club", + "role": "developer", + "user_list": [ + "abisubramanya27", + "lahwaacz" + ] + }, + "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "abisubramanya27", + "lahwaacz" + ], + "group": "byteblaze", + "project": "a11y-webring.club", + "access_level": 30 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Developer" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Developer" + ] + } + } + ] + }, + "intent_template_id": 351, + 
"changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 577, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "GitHub timeline item management extension", + "role": "maintainer", + "user_list": [ + "abisubramanya27", + "lahwaacz" + ] + }, + "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "abisubramanya27", + "lahwaacz" + ], + "group": "byteblaze", + "project": "remove-board-movement-events-from-the-github-issue-timeline", + "access_level": 40 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", + "required_contents": { + "must_include": [ + "Maintainer" + ] + } + } + ] + }, + "intent_template_id": 351, + "changelogs": [ + { + "key": "expected_backend_state", + 
"category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 578, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "millennials-to-snake-people", + "role": "reporter", + "user_list": [ + "yjlou", + "a11yproject" + ] + }, + "intent": "Add the following users to repo millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "yjlou", + "a11yproject" + ], + "group": "byteblaze", + "project": "millennials-to-snake-people", + "access_level": 20 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Reporter" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'a11yproject')", + "required_contents": { + "must_include": [ + "Reporter" + ] + } + } + ] + }, + "intent_template_id": 351, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to 
backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 579, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "time tracking tool", + "role": "guest", + "user_list": [ + "yjlou" + ] + }, + "intent": "Add the following users to my time tracking tool as guest: ['yjlou']", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_collaborators", + "eval_params": { + "collaborators": [ + "yjlou" + ], + "group": "byteblaze", + "project": "timeit", + "access_level": 10 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/timeit/-/project_members", + "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", + "required_contents": { + "must_include": [ + "Guest" + ] + } + } + ] + }, + "intent_template_id": 351, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 580, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "sci_fi", + "description": "A wild place for sci-fi enthusiasts", + "sidebar_list": [ + "New", + "Classic", + "Movies", + "Post my novel", + "Random" + ] + }, + "intent": "Create 
a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_subreddit_has_description", + "expected_data": { + "subreddit": "sci_fi", + "description": [ + "New", + "Classic", + "Movies", + "Post my novel", + "Random" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/sci_fi/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "A wild place for sci-fi enthusiasts" + ] + } + }, + { + "url": "__REDDIT__/f/sci_fi/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "New", + "Classic", + "Movies", + "Post my novel", + "Random" + ] + } + } + ] + }, + "intent_template_id": 7, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 581, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "cmu_lti", + "description": "Language Technologies Institute at Carnegie Mellon University", + "sidebar_list": [ + "announcement", + "paper", + "alumni" + ] + }, + "intent": "Create a new forum named cmu_lti, with a description of 
Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_subreddit_has_description", + "expected_data": { + "subreddit": "cmu_lti", + "description": [ + "announcement", + "paper", + "alumni" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/cmu_lti/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Language Technologies Institute at Carnegie Mellon University" + ] + } + }, + { + "url": "__REDDIT__/f/cmu_lti/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "announcement", + "paper", + "alumni" + ] + } + } + ] + }, + "intent_template_id": 7, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 582, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "Cyberpunk", + "description": "Welcome to the future", + "sidebar_list": [ + "Games", + "Books", + "Movies", + "Future" + ] + }, + "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", + 
"require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_subreddit_has_description", + "expected_data": { + "subreddit": "sci_fi", + "description": [ + "Games", + "Books", + "Movies", + "Future" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Welcome to the future" + ] + } + }, + { + "url": "__REDDIT__/f/Cyberpunk/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "Games", + "Books", + "Movies", + "Future" + ] + } + } + ] + }, + "intent_template_id": 7, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 583, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "PlantsForCatParents", + "description": "Cat parents & plan lovers", + "sidebar_list": [ + "Cat friendly", + "Local vendors", + "Promotion", + "Toxic plants!" 
+ ] + }, + "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plan lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_subreddit_has_description", + "expected_data": { + "subreddit": "PlantsForCatParents", + "description": [ + "Cat friendly", + "Local vendors", + "Promotion", + "Toxic plants!" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/PlantsForCatParents/edit", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Cat parents & plan lovers" + ] + } + }, + { + "url": "__REDDIT__/f/PlantsForCatParents/edit", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "Cat friendly", + "Local vendors", + "Promotion", + "Toxic plants!" 
+ ] + } + } + ] + }, + "intent_template_id": 7, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 584, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "instantiation_dict": { + "name": "Karaoke", + "description": "Place for Karaoke lovers", + "sidebar_list": [ + "devices", + "setup" + ] + }, + "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_subreddit_has_description", + "expected_data": { + "subreddit": "Karaoke", + "description": [ + "devices", + "setup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/Karaoke", + "locator": "document.querySelector(\"#forum_description\").value", + "required_contents": { + "must_include": [ + "Place for Karaoke lovers" + ] + } + }, + { + "url": "__REDDIT__/f/Karaoke", + "locator": "document.querySelector(\"#forum_sidebar\").value", + "required_contents": { + "must_include": [ + "devices", + "setup" + ] + } + } + ] + }, + "intent_template_id": 7, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + 
"category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 585, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "floor lamp", + "num_star": 5, + "nickname": "Emma Lopez" + }, + "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_review", + "expected_data": { + "sku": "B00J8RZL7I", + "num_star": 5, + "nickname": "Emma Lopez" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B00J8RZL7I')", + "required_contents": { + "must_include": [ + "100" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B00J8RZL7I')", + "required_contents": { + "must_include": [ + "Emma Lopez" + ] + } + } + ] + }, + "intent_template_id": 194, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 586, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Jiffy Corn Muffin Cornbread Mix", + "num_star": 4, + "nickname": "ShoppingEmma" + }, + "intent": "Rate my 
recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_review", + "expected_data": { + "sku": "B07HZB38XH", + "num_star": 4, + "nickname": "ShoppingEmma" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B07HZB38XH')", + "required_contents": { + "must_include": [ + "80" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B07HZB38XH')", + "required_contents": { + "must_include": [ + "ShoppingEmma" + ] + } + } + ] + }, + "intent_template_id": 194, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 587, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "PS3 Remote Controller Skins", + "num_star": 3, + "nickname": "GamingEmma" + }, + "original.instantiation_dict": { + "product": "PS3 Remote Controllers", + "num_star": 3, + "nickname": "GamingEmma" + }, + "intent": "Rate my recent purchase of PS3 Remote Controller Skins with 3 stars, using my nickname GamingEmma?", + "original.intent": "Rate my recent purchase of PS3 Remote Controllers with 3 stars, using my nickname GamingEmma?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_review", + "expected_data": { + "sku": "B0041MSF2S", + "num_star": 3, + "nickname": 
"GamingEmma" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B0041MSF2S')", + "required_contents": { + "must_include": [ + "60" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B0041MSF2S')", + "required_contents": { + "must_include": [ + "GamingEmma" + ] + } + } + ] + }, + "intent_template_id": 194, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "reference_alignment", + "note": "Changed product name to indicate the desired product is a skin not the controller" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 588, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Foundation For Mattress With Frame Set", + "num_star": 1, + "nickname": "ShoppingEmma" + }, + "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using my nickname ShoppingEmma?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_review", + "expected_data": { + "sku": "B07DFJ5XKH", + "num_star": 1, + "nickname": "ShoppingEmma" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B07DFJ5XKH')", + "required_contents": { + "must_include": [ + "20" + ] + 
} + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B07DFJ5XKH')", + "required_contents": { + "must_include": [ + "ShoppingEmma" + ] + } + } + ] + }, + "intent_template_id": 194, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 589, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": { + "product": "Mini Wireless Bluetooth Speaker", + "num_star": 2, + "nickname": "SimpleEmma" + }, + "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname SimpleEmma?", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_review", + "expected_data": { + "sku": "B09P7BFL4H", + "num_star": 2, + "nickname": "SimpleEmma" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_rating('B09P7BFL4H')", + "required_contents": { + "must_include": [ + "40" + ] + } + }, + { + "url": "last", + "locator": "func:shopping_get_sku_latest_review_author('B09P7BFL4H')", + "required_contents": { + "must_include": [ + "SimpleEmma" + ] + } + } + ] + }, + "intent_template_id": 194, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 590, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "event of product launch", + "start_date": "1/16/2023", + "end_date": "1/30/2023" + }, + "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2023 and ending on 1/30/2023", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_milestone_exist_with_fields", + "eval_params": { + "group": "primer", + "project": "design", + "values": { + "title": "product launch", + "start_date": "2023-01-16", + "due_date": "2023-01-30" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "product launch" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Jan 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Jan 30, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 591, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": 
"__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "practice of collective code review", + "start_date": "1/16/2023", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2023 and ending on in 20 days", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_milestone_exist_with_fields", + "eval_params": { + "group": "primer", + "project": "design", + "values": { + "title": "code review", + "start_date": "2023-01-16", + "due_date": "2023-02-05" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "code review" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Jan 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Feb 5, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 592, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + 
"start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of cleaning sensitive information", + "start_date": "2/16/2023", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2023 and ending on in 20 days", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_milestone_exist_with_fields", + "eval_params": { + "group": "primer", + "project": "design", + "values": { + "title": "sensitive information", + "start_date": "2023-02-16", + "due_date": "2023-03-08" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "sensitive information" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Feb 16, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Mar 8, 2030" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 593, + "require_login": true, + 
"storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of merging all branches to main", + "start_date": "March 15, 2044", + "end_date": "March 30, 2044" + }, + "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_milestone_exist_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "dotfiles", + "values": { + "title": "all branches to main", + "start_date": "2044-03-15", + "due_date": "2044-03-30" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "all branches to main" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "Mar 15, 2044" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "Mar 30, 2044" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + 
"sites": [ + "gitlab" + ], + "task_id": 594, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "instantiation_dict": { + "event": "task of adding a new branch for zsh comprehensive support", + "start_date": "5/1/2044", + "end_date": "in 20 days" + }, + "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_milestone_exist_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "dotfiles", + "values": { + "title": "zsh comprehensive support", + "start_date": "2044-05-01", + "due_date": "2044-05-21" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"#content-body\").outerText", + "required_contents": { + "must_include": [ + "zsh comprehensive support" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.start_date').outerText", + "required_contents": { + "must_include": [ + "May 1, 2044" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.due_date').outerText", + "required_contents": { + "must_include": [ + "May 21, 2044" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 339, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": 
"dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 595, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "space" + }, + "intent": "Open the thread of a trending post on the forum \"space\" and subscribe.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_subscribed_to_subreddit", + "expected_data": { + "subreddit": "space" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/space", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "space" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 596, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "books" + }, + "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": 
"validate_user_subscribed_to_subreddit", + "expected_data": { + "subreddit": "books" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "books" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 597, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "consoles" + }, + "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_subscribed_to_subreddit", + "expected_data": { + "subreddit": "consoles" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "consoles" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "changelogs": [ + { + "key": "expected_backend_state", + "category": 
"switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 598, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "pittsburgh" + }, + "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_subscribed_to_subreddit", + "expected_data": { + "subreddit": "pittsburgh" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 599, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": { + "subreddit": "machine learning" + }, + "intent": 
"Open the thread of a trending post on the forum \"machine learning\" and subscribe.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_subscribed_to_subreddit", + "expected_data": { + "subreddit": "machine learning" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/MachineLearning", + "program_html": [ + { + "url": "__REDDIT__/", + "locator": "document.querySelector(\"#sidebar > section\").outerText", + "required_contents": { + "must_include": [ + "MachineLearning" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 4, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 600, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "what is the recommended console to buy these days" + }, + "intent": "Post my question, \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "consoles", + "title": "what is the recommended console to buy these days" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__REDDIT__/f/consoles", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the recommended console to buy these days" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 601, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "is car necessary in NYC" + }, + "intent": "Post my question, \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "nyc", + "title": "is car necessary in NYC?" 
+ } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "is car necessary in NYC" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 602, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "places for new drivers to learn driving in pittsburgh" + }, + "intent": "Post my question, \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "pittsburgh", + "title": "places for new drivers to learn driving in pittsburgh" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "places for new drivers 
to learn driving in pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 603, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "safe and budget apartment to live in nyc" + }, + "original.instantiation_dict": { + "question": "safe and budge apartment to live in nyc" + }, + "intent": "Post my question, \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer", + "original.intent": "Post my question, \"safe and budge apartment to live in nyc\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "nyc", + "title": "safe and budget apartment to live in nyc" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "safe and budge apartment to live in nyc" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + 
"note": "Typo in the question" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 604, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": { + "question": "what is the SOTA web navigation agent repo" + }, + "intent": "Post my question, \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "consoles", + "title": "what is the recommended console to buy these days" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the SOTA web navigation agent repo" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 3765, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + 
"sites": [ + "reddit" + ], + "task_id": 605, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "gaming consoles", + "question": "what is the recommended console to buy these days" + }, + "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "consoles", + "title": "what is the recommended console to buy these days" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/consoles", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the recommended console to buy these days" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 606, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + 
"topic": "NYC", + "question": "is car necessary" + }, + "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "nyc", + "title": "is car necessary" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "is car necessary" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 607, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "city Pittsburgh", + "question": "places for new drivers to learn driving" + }, + "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "pittsburgh", + "title": "places for new drivers to learn driving" + } + } + } + ], 
+ "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "places for new drivers to learn driving" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 608, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "city lives in DMV area", + "question": "safe and budget apartment to live" + }, + "original.instantiation_dict": { + "topic": "city lives in DMV area", + "question": "safe and budge apartment to live" + }, + "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", + "original.intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budge apartment to live\" there", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "washington", + "title": "safe and budge apartment to live" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + 
"program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/washington |OR| __REDDIT__/f/washingtondc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "safe and budge apartment to live" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Typo in the question" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 609, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": { + "topic": "ML, DL, NLP", + "question": "what is the SOTA web navigation agent repo" + }, + "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "deeplearning", + "title": "what is the SOTA web navigation agent repo" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", + "program_html": [ + { + "url": 
"func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "what is the SOTA web navigation agent repo" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 5, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 610, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "To Kill a Mockingbird by Harper Lee", + "content": "good book!" 
+ }, + "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in the r/books and put my comment \"good book!\" in the body.", + "original.intent": "Post a review of my recent reading \"To Kill a Mockingbird by Harper Lee\" in the r/books with my comment \"good book!\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "content": "good book!", + "title": "To Kill a Mockingbird by Harper Lee" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "To Kill a Mockingbird by Harper Lee", + "good book!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the book title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 611, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Harry Potter", + "content": "Wonderful journey" + }, + "intent": "Post a review of my recent reading, titled \"Harry Potter\" in the r/books and put my comment \"Wonderful journey\" in the body.", + "original.intent": "Post a review of my recent reading \"Harry Potter\" in the r/books with my comment \"Wonderful journey\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "content": "Wonderful journey", + "title": "Harry Potter" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Harry Potter", + "Wonderful 
journey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the book title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 612, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "big little lies", + "content": "can't stop it" + }, + "intent": "Post a review of my recent reading, titled \"big little lies\" in the r/books and put my comment \"can't stop it\" in the body.", + "original.intent": "Post a review of my recent reading \"big little lies\" in the r/books with my comment \"can't stop it\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "content": "can't stop it", + "title": "big little lies" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "big little lies", + "can't 
stop it" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the book title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 613, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Love story", + "content": "I cried" + }, + "intent": "Post a review of my recent reading, titled \"Love story\" in the r/books and put my comment \"I cried\" in the body.", + "original.intent": "Post a review of my recent reading \"Love story\" in the r/books with my comment \"I cried\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "comment": "I cried", + "title": "Love story" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Love story", + "I cried" + ] + } + } + ], + "url_note": "GOLD in PRED" + 
}, + "intent_template_id": 9, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the book title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 614, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", + "instantiation_dict": { + "book": "Gone with the wind", + "content": "It's a book with history" + }, + "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in the r/books and put my comment \"It's a book with history\" in the body.", + "original.intent": "Post a review of my recent reading \"Gone with the wind\" in the r/books with my comment \"It's a book with history\".", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "content": "It's a book with history", + "title": "Gone with the wind" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Gone with the wind", + "It's a 
book with history" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 9, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the book title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 615, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Bald Eagle", + "subreddit": "earthporn" + }, + "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/earthporn" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/earthporn", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + 
"sites": [ + "reddit" + ], + "task_id": 616, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Thanksgiving turkey", + "subreddit": "funny" + }, + "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/funny" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 617, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Firework", + "subreddit": "earthporn" + }, + "intent": "Re-post the image of Firework in this page to earthporn subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { 
+ "url": "__REDDIT__/f/earthporn" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/earthporn", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 618, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "Wife's costume", + "subreddit": "funny" + }, + "intent": "Re-post the image of Wife's costume in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/funny" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": 
"[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 619, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pics", + "geolocation": null, + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", + "instantiation_dict": { + "content": "costume contest", + "subreddit": "funny" + }, + "intent": "Re-post the image of costume contest in this page to funny subreddit and note \"from /f/pics\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/funny" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "from /f/pics" + ] + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 11 + }, + { + "sites": [ + "reddit" + ], + "task_id": 620, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": 
"Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "deal with long-distance relationships" + }, + "intent": "Ask for advice about deal with long-distance relationships in a subreddit for relations", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "title": [ + "long", + "distance", + "relationship" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "long", + "distance", + "relation" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 621, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "cheat" + }, + "intent": "Ask for advice about cheat in a subreddit for relations", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "title": "cheat" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + 
"eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "cheat" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 622, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "sexual harassment" + }, + "intent": "Ask for advice about sexual harassment in a subreddit for relations", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "title": "sexual harassment" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "sexual", + "harassment" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system 
check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 623, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "gift for birthday" + }, + "intent": "Ask for advice about gift for birthday in a subreddit for relations", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "title": [ + "gift", + "birthday" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "gift", + "birthday" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 624, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", + "instantiation_dict": { + "issue": "break-up remedy" + }, + "intent": "Ask for advice 
about break-up remedy in a subreddit for relations", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "title": [ + "break", + "up", + "remedy" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "break", + "remedy" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 12, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 625, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "the effectiveness of deep learning" + }, + "original.instantiation_dict": { + "topic": "the effectiveness of online learning" + }, + "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + 
"original.intent": "Create a discussion post about \"the effectiveness of online learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "deeplearning", + "content": [ + "your opinion" + ], + "title": "the effectiveness of deep learning" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "the effectiveness of online learning" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Relevant subreddit is difficult to decide based on the original topic" + }, + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Ambiguous subreddits based on original topic" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 626, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + 
"original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Iphone 14" + }, + "intent": "Create a discussion post titled \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent": "Create a discussion post about \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "iphone", + "content": [ + "your opinion" + ], + "title": "Iphone 14" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Iphone 14" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the post title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 627, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant 
subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Harry Potter movie series" + }, + "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent": "Create a discussion post about \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "movies", + "content": [ + "your opinion" + ], + "title": "Harry Potter movie series" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/movies", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Harry Potter movie series" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the post title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 628, + "require_login": true, + 
"storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "long distance relationship" + }, + "intent": "Create a discussion post titled \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent": "Create a discussion post about \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "relationship_advice", + "content": [ + "your opinion" + ], + "title": "long distance relationship" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/relationship_advice", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "long distance relationship" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the post title and content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + 
"category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 629, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": { + "topic": "Fun thing to do in Pittsburgh" + }, + "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "original.intent": "Create a discussion post about \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "pittsburgh", + "content": [ + "your opinion" + ], + "title": "Fun thing to do in Pittsburgh" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "your opinion", + "Fun thing to do in Pittsburgh" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 13, + "changelogs": [ + { + "key": "intent_template", + "category": "clarify_instructions", + "note": "Need to specify the post title and 
content" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 630, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "noise-cancelling headphones", + "price": "$200", + "subreddit": "r/headphones" + }, + "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "headphones", + "title": [ + "200", + "noise", + "cancelling", + "headphones" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "200", + "noise-cancelling", + "headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 631, + 
"require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running shoes", + "price": "$100", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "sports", + "title": [ + "running", + "shoes", + "100" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "100", + "running", + "shoes" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 632, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running shoes", + "price": "$500", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports", + 
"require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "sports", + "title": [ + "running", + "shoes", + "500" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "running", + "shoes" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 633, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "running pants", + "price": "$500", + "subreddit": "r/sports" + }, + "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "sports", + "title": [ + "running", + "pants", + "500" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/sports", + "program_html": [ + { + "url": 
"func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "running", + "pants" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 634, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "instantiation_dict": { + "category": "used iphone", + "price": "$1000", + "subreddit": "r/iphone" + }, + "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "iphone", + "title": [ + "1000", + "used iphone" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "1000", + "used iphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 15, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", 
+ "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 635, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "noise-cancelling headphones", + "price": "$200" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "headphones", + "title": [ + "200", + "noise", + "cancelling", + "headphones" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "200", + "noise-cancelling", + "headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 636, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate 
subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "DIY toolkit", + "price": "$100" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY toolkit products within a budget of $100", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "diy", + "title": [ + "100", + "DIY", + "toolkit" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/DIY", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "100", + "DIY", + "toolkit" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 637, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "sony headphones", + "price": "$500" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": 
"validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "headphones", + "title": [ + "500", + "sony", + "headphones" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/headphones", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "500", + "sony headphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 638, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "must-have product in my life", + "price": "$30" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product in my life products within a budget of $30", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/BuyItForLife" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/BuyItForLife", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + 
"30", + "must-have", + "product", + "life" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100 + }, + { + "sites": [ + "reddit" + ], + "task_id": 639, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", + "instantiation_dict": { + "category": "used iphone", + "price": "$1000" + }, + "intent": "Post in the most appropriate subreddit and ask for recommendations for used iphone products within a budget of $1000", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "iphone", + "title": [ + "1000", + "used iphone" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/iphone", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "1000", + "used iphone" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 6100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 640, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", 
+ "instantiation_dict": { + "interest": "book reading", + "date": "March 15th", + "subreddit": "r/books" + }, + "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "title": [ + "book reading", + "March 15th", + "virtual meetup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "book reading", + "March 15th", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 641, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Harry Potter", + "date": "July 8th", + "subreddit": "books" + }, + "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + 
"subreddit": "books", + "title": [ + "Harry Potter", + "July 8th", + "virtual meetup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Harry Potter", + "July 8th", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 642, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Big little lies", + "date": "Sep 10th", + "subreddit": "books" + }, + "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "title": [ + "Big little lies", + "Sep 10th", + "virtual meetup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": 
"document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Big little lies", + "Sep 10th", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 643, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "racing cars", + "date": "Oct 21st", + "subreddit": "nyc" + }, + "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "title": [ + "racing cars", + "Oct 21st", + "virtual meetup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/nyc", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "racing cars", + "Oct 21st", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + 
"key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 644, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", + "instantiation_dict": { + "interest": "Tears of Kingdom", + "date": "Dec 15th", + "subreddit": "games" + }, + "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the games subreddit", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "title": [ + "Tears of Kingdom", + "Dec 15th", + "virtual meetup" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/games", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "Tears of Kingdom", + "Dec 15th", + "virtual meetup" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 16, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 645, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} 
subreddit about what could {{model_type}} help the corresponding field.", + "original.intent_template": "Post in {{subreddit}} subreddit about what could machine learning help the correpong field.", + "instantiation_dict": { + "model_type": "machine learning", + "subreddit": "books" + }, + "original.instantiation_dict": { + "subreddit": "books" + }, + "intent": "Post in books subreddit about what could machine learning help the corresponding field.", + "original.intent": "Post in books subreddit about what could machine learning help the correpong field.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "books", + "title": [ + "machine learning", + "help" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "machine learning", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Grammatical error in task intent template" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 646, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + 
"geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "original.intent_template": "Post in {{subreddit}} subreddit about what could midjourney help the correpong field.", + "instantiation_dict": { + "model_type": "midjourney", + "subreddit": "DIY" + }, + "original.instantiation_dict": { + "subreddit": "DIY" + }, + "intent": "Post in DIY subreddit about what could midjourney help the corresponding field.", + "original.intent": "Post in DIY subreddit about what could midjourney help the correpong field.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "diy", + "title": [ + "midjourney", + "help" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/diy", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "midjourney", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Grammatical error in task intent template" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 647, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": 
"__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "original.intent_template": "Post in {{subreddit}} forum about what could open-source LLMs help the correpong field.", + "instantiation_dict": { + "model_type": "open-source LLMs", + "subreddit": "technology" + }, + "original.instantiation_dict": { + "subreddit": "technology" + }, + "intent": "Post in technology subreddit about what could open-source LLMs help the corresponding field.", + "original.intent": "Post in technology forum about what could open-source LLMs help the correpong field.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "technology", + "title": [ + "open-source LLMs", + "help" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "open-source LLMs", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Grammatical error in task intent template" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 648, + 
"require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "original.intent_template": "Post in {{subreddit}} forum about what could large language models help the correpong field.", + "instantiation_dict": { + "model_type": "large language models", + "subreddit": "dataisbeautiful" + }, + "original.instantiation_dict": { + "subreddit": "dataisbeautiful" + }, + "intent": "Post in dataisbeautiful subreddit about what could large language models help the corresponding field.", + "original.intent": "Post in dataisbeautiful forum about what could large language models help the correpong field.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "dataisbeautiful", + "title": [ + "large language models", + "help" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/dataisbeautiful", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "large language models", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Grammatical error in task intent template" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + 
"note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 649, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "original.intent_template": "Post in {{subreddit}} subreddit about what could diffusion model help the correpong field.", + "instantiation_dict": { + "model_type": "diffusion model", + "subreddit": "dataisbeautiful" + }, + "original.instantiation_dict": { + "subreddit": "history" + }, + "intent": "Post in dataisbeautiful subreddit about what could diffusion model help the corresponding field.", + "original.intent": "Post in history subreddit about what could diffusion model help the correpong field.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "history", + "title": [ + "diffusion models", + "help" + ] + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/history", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "diffusion model", + "help" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 19, + "changelogs": [ + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Fix task wording" + }, + { + "key": "intent_template", + "category": "spelling_or_grammar", + "note": "Grammatical error in task intent template" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, 
+ { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 650, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "I am a big fan of the bookorg" + }, + "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_comment_exists_in_post", + "expected_data": { + "post_id": "59421", + "comment_content": "I am a big fan of the bookorg" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "I am a big fan of the bookorg" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 651, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", + "geolocation": null, + 
"intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "Yeah, pittsburgh traffic, you know..." + }, + "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_comment_exists_in_post", + "expected_data": { + "post_id": "45899", + "comment_content": "Yeah, pittsburgh traffic, you know..." + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "Yeah, pittsburgh traffic, you know..." + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 652, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "geolocation": null, + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "content_description": "???" 
+ }, + "intent": "Reply to the post with my comment \"???\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_comment_exists_in_post", + "expected_data": { + "post_id": "134852", + "comment_content": "???" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.comment__body').outerText", + "required_contents": { + "exact_match": "???" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 23, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 653, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": { + "product": "phone screen protector", + "order_id": "000000180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", + "original.intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that it broke after just three days of use. Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "000000180", + "B087QJN9W1" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 654, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it 
broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "bluetooth speaker", + "order_id": "161" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", + "original.intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "161", + "B09P7BFL4H" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + 
"note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 655, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "iphone case", + "order_id": "180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "original.intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. 
Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "180", + "B087QJN9W1" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 656, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": { + "product": "remote controller", + "order_id": "180" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "original.intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "it broke after three days of use", + "180", + "B0041MSF2S" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 657, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": { + "product": "speaker", + "order_id": "148" + }, + "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", + "original.intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "refund", + "broke after three days of use", + "148", + "B003FVW3VA" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 153, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clarify the intent to provide exact string to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "gitlab" + 
], + "task_id": 658, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "a11yproject", + "issue": "401 bad gateway", + "account": "Roshanjossey", + "due": "the end of 2030" + }, + "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. Set due date to be the end of 2030", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "fields": { + "title": "401 bad gateway", + "assignee.username": "Roshanjossey", + "due_date": "2030-12-31" + } + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "401 bad gateway" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Dec 31, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Roshan Jossey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent 
evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 659, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "empathy-prompts", + "issue": "Integrating LLMs for better prompts", + "account": "Roshanjossey", + "due": "the beginning of Q2 2033" + }, + "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. Set due date to be the beginning of Q2 2033", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "empathy-prompts", + "fields": { + "title": "Integrating LLMs for better prompts", + "assignee.username": "Roshanjossey", + "due_date": "2033-04-01" + } + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "Integrating LLMs for better prompts" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Apr 1, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Roshan Jossey" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327, + "changelogs": [ + { + "key": "expected_backend_state", + "category": 
"switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 660, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "dotfiles", + "issue": "add support for oh-my-zsh", + "account": "Abishek", + "due": "July 18 2033" + }, + "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. Set due date to be July 18 2033", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "dotfiles", + "fields": { + "title": "add support for oh-my-zsh", + "assignee.username": "abisubramanya27", + "due_date": "2033-07-18" + } + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/tree/main", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "add support for oh-my-zsh" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Jul 18, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Abishek S" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 
327, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 661, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "ChatGPT", + "issue": "report the issue of connection refused" + }, + "intent": "Open an issue to report the issue of connection refused in ChatGPT.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "convexegg", + "project": "chatgpt", + "fields": {} + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/convexegg/chatgpt/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "connection refused" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 662, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + 
"repo": "aem-hacker", + "issue": "report experiencing \"OSError: [Errno 98] Address already in use\" during executions" + }, + "intent": "Open an issue to report experiencing \"OSError: [Errno 98] Address already in use\" during executions in aem-hacker.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "0ang3el", + "project": "aem-hacker", + "fields": {} + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/0ang3el/aem-hacker/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "OSError: [Errno 98] Address already in use" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 663, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "metaseq", + "issue": "ask their plan on supporting Llama and other llama family models" + }, + "intent": "Open an issue to ask their plan on supporting Llama and other llama family models in metaseq.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "root", + "project": "metaseq", + "fields": {} + } + } + } + ], + "site": "gitlab" + 
}, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/root/metaseq/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "llama" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 664, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "awesome-python", + "issue": "ask their plans on adding Python 3.11 related resources" + }, + "intent": "Open an issue to ask their plans on adding Python 3.11 related resources in awesome-python.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "vinta", + "project": "awesome-python", + "fields": {} + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/vinta/awesome-python/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "Python 3.11" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to 
verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 665, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "a11y-syntax-highlighting", + "issue": "request adding support for MT theme editor" + }, + "intent": "Open an issue to request adding support for MT theme editor in a11y-syntax-highlighting.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "a11y-syntax-highlighting", + "fields": {} + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "MT theme editor" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 666, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a request to merge {{source_branch}} branch into {{target_branch}} branch, assign {{reviewer}} as 
the reviewer", + "instantiation_dict": { + "source_branch": "dialog-component", + "target_branch": "dialog", + "reviewer": "Carol" + }, + "intent": "Submit a request to merge dialog-component branch into dialog branch, assign Carol as the reviewer", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "dialog" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "dialog-component" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Caroline Stewart" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "No user named Carol" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 667, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "dialog-component", + "target_branch": "bump-doctocat", + "reviewer": "primer" + }, + "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat 
branch, assign primer as the reviewer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_merge_request_has_values", + "eval_params": { + "group": "primer", + "project": "design", + "source_branch": "dialog-component", + "target_branch": "bump-doctocat", + "reviewer": "primer" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "bump-doctocat" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "dialog-component" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Primer" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 668, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "a11yproject.com/redesign", + "target_branch": "master", + "reviewer": "Justin Armstrong" + }, + "intent": "Submit a merge request for 
a11yproject.com/redesign branch to be merged into master branch, assign Justin Armstrong as the reviewer", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "redesign" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Justin Armstrong" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "No branch named master" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 669, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/solarized-prism-theme", + "geolocation": null, + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": { + "feature": "dark mode" + }, + "intent": "Open a new issue to discuss the implementation of dark mode", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "solarized-prism-theme", + "fields": { + "title": "dark mode" + }, + 
"title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "implementation", + "dark mode" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 337, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 670, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/byteblaze/dotfiles", + "geolocation": null, + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": { + "feature": "default plugins for .zsh" + }, + "intent": "Open a new issue to discuss the implementation of default plugins for .zsh", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "dotfiles", + "fields": { + "title": "default plugins for .zsh" + }, + "title_field_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/dotfiles/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.detail-page-description').outerText", + "required_contents": { + "must_include": [ + "implementation", + "default plugins", + "zsh" + ] + } + } 
+ ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 337, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 671, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Sony Computer Entertainment VR", + "rating": "2 stars and less" + }, + "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Sony Computer Entertainment VR\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/gaming" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Sony Computer Entertainment VR" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "didn't last a year without issues", + "Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.", + "Received used items!!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 672, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Nintendo Switch Fortnite Wildcat Console EU", + "rating": "3 stars and less" + }, + "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/gaming" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "EU charger and wild cat card doesn\u2019t even work!", + "REFUND REJECTED", + "Charging port not compatible", + "not compatible in the US", + "Wildcard Bonus Credits Not Redeemable!", + "Code not available!!" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 673, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Racing Wheel Overdrive for Xbox X", + "rating": "1 star" + }, + "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/gaming" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Racing Wheel Overdrive for Xbox X" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Unable to set neutral steering", + "Doesn\u2019t work with PC", + "Crazy problems in automatic mode", + "pedals stopped working", + "Only works with certain games" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 674, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of 
{{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", + "rating": "3 stars and less" + }, + "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/gaming" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Poorly Made Exterior. 
Consider a different Company.", + "piece of junk ,..can't believe I spent money on this !!!!", + "Based arrived broken but game itself works" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping", + "reddit" + ], + "task_id": 675, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": { + "product": "HORI 3D Surround Gaming Neckset", + "rating": "2 stars and less" + }, + "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/gaming" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/gaming", + "program_html": [ + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__title').outerText", + "required_contents": { + "exact_match": "real user feedback on HORI 3D Surround Gaming Neckset" + } + }, + { + "url": "func:reddit_get_post_url('__last_url__')", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "Not worth it for PC users", + "I really wanted to like this.", + "I wish this was better..." 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 101 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 676, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to and filter for orders that are {{status}}", + "original.intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "suspected of being fraudulent" + }, + "intent": "Navigate to and filter for orders that are suspected of being fraudulent", + "original.intent": "Lookup orders that are suspected of being fraudulent", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/", + "eval_func": { + "name": "verify_ui_selector_value", + "eval_params": { + "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", + "expected_value": "Suspected Fraud" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Suspected Fraud" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 677, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": 
"__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to and filter for orders that are {{status}}", + "original.intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "processing" + }, + "intent": "Navigate to and filter for orders that are processing", + "original.intent": "Lookup orders that are processing", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/", + "eval_func": { + "name": "verify_ui_selector_value", + "eval_params": { + "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", + "expected_value": "Processing" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Processing" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 678, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to and filter for orders that are {{status}}", + "original.intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "canceled" + }, + "intent": "Navigate to and filter for orders that are canceled", + 
"original.intent": "Lookup orders that are canceled", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/", + "eval_func": { + "name": "verify_ui_selector_value", + "eval_params": { + "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", + "expected_value": "Canceled" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Canceled" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 679, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to and filter for orders that are {{status}}", + "original.intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "completed" + }, + "intent": "Navigate to and filter for orders that are completed", + "original.intent": "Lookup orders that are completed", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/", + "eval_func": { + "name": "verify_ui_selector_value", + "eval_params": { + "locator": "div.admin__data-grid-filters-current._show > 
div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", + "expected_value": "Completed" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "Completed" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 680, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Navigate to and filter for orders that are {{status}}", + "original.intent_template": "Lookup orders that are {{status}}", + "instantiation_dict": { + "status": "on hold" + }, + "intent": "Navigate to and filter for orders that are on hold", + "original.intent": "Lookup orders that are on hold", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/sales/order/", + "eval_func": { + "name": "verify_ui_selector_value", + "eval_params": { + "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", + "expected_value": "On Hold" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": 
"__SHOPPING_ADMIN__/sales/order/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", + "required_contents": { + "must_include": [ + "On Hold" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 253, + "changelogs": [ + { + "key": "intent_template", + "category": "task_ambiguity", + "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 681, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "gan implementation" + }, + "intent": "Find a GitLab repository related to gan implementation and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "gan implementation", + "__GITLAB__/eriklindernoren/PyTorch-GAN" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 682, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + 
"start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "chatGPT" + }, + "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "chatgpt", + "__GITLAB__/convexegg/chatgpt" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 683, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": { + "topic": "metaseq" + }, + "intent": "Find a GitLab repository related to metaseq and make a Reddit post linking to it in a relevant subreddit", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", + "program_html": [ + { + 
"url": "last", + "locator": "document.querySelector('.submission__inner').outerText", + "required_contents": { + "must_include": [ + "metaseq", + "__GITLAB__/root/metaseq" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 116 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 684, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "byteblaze/cloud-to-butt", + "subreddit": "LifeProTips" + }, + "intent": "Promote byteblaze/cloud-to-butt to subreddit LifeProTips with the description from the repo itself.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/LifeProTips" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/LifeProTips", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'", + "__GITLAB__/byteblaze/cloud-to-butt" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 685, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "byteblaze/dotfiles", + "subreddit": "aww" + }, + "intent": "Promote 
byteblaze/dotfiles to subreddit aww with the description from the repo itself.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/aww" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/aww", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Computer setup", + "__GITLAB__/byteblaze/dotfiles" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 686, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "auth0/angular-storage", + "subreddit": "technology" + }, + "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/technology" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/technology", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "A storage library for AngularJS done right", + 
"__GITLAB__/auth0/angular-storage" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 687, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "koush/AndroidAsync", + "subreddit": "funny" + }, + "intent": "Promote koush/AndroidAsync to subreddit funny with the description from the repo itself.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/funny" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/funny", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "Asynchronous socket, http(s) (client+server) and websocket library for android. 
Based on nio, not threads.", + "__GITLAB__/koush/AndroidAsync" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "reddit", + "gitlab" + ], + "task_id": 688, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": { + "repo": "lahwaacz/arch-wiki-docs", + "subreddit": "science" + }, + "intent": "Promote lahwaacz/arch-wiki-docs to subreddit science with the description from the repo itself.", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__REDDIT__/f/science" + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__REDDIT__/f/science", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", + "required_contents": { + "must_include": [ + "A script to download pages from Arch Wiki for offline browsing", + "__GITLAB__/lahwaacz/arch-wiki-docs" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 117 + }, + { + "sites": [ + "shopping" + ], + "task_id": 689, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I am a loyal customer" + }, + "intent": "Draft an email to the 
shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a loyal customer\"", + "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a loyal customer", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "I am a loyal customer" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify exact text to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 690, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "they promised me a coupon last time" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", + "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as they 
promised me a coupon last time", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "promised" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify exact text to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 691, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I plan to make a bulk purchase" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", + "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I plan to make a bulk purchase", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s 
on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "bulk purchase" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify exact text to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 692, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "I am a student" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a student\"", + "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a student", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector('[title=\"What\u2019s on your mind?\"').value", + "required_contents": { + "must_include": [ + "coupon", + "student" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify exact text to enter" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 693, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", + "instantiation_dict": { + "reason": "my refund is supposed to be replaced by a coupon" + }, + "original.instantiation_dict": { + "reason": "my refund is suppoed to be replaced by a coupon" + }, + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", + "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as my refund is suppoed to be replaced by a coupon", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING__/contact", + "eval_func": { + "name": "verify_ui_value_exists", + "eval_params": { + "locator": "title=\"What\u2019s on your mind?\"" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING__/contact", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", + 
"required_contents": { + "must_include": [ + "coupon", + "refund" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 163, + "changelogs": [ + { + "key": "intent_template", + "category": "permissive_string_match", + "note": "Clarify exact text to enter" + }, + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Fix spelling" + }, + { + "key": "expected_ui_state", + "category": "context_free_evaluation", + "note": "Used locator check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 694, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Energy-Bulk Women Shirt", + "stock": "50", + "size": "S", + "color": "blue", + "price": "60" + }, + "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Energy-Bulk Women Shirt" + }, + "expected_data": { + "name": "Energy-Bulk Women Shirt", + "price": 60.0, + "stock_qty": 50, + "color": "50", + "size": "167" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "60.00" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Energy-Bulk Women Shirt" + ] + 
} + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "top" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "167" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "tops" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 695, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Energy-Bulk Man Yoga Pant", + "stock": "50", + "size": "38", + "color": "yellow", + "price": "69.99" + }, + "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Energy-Bulk Man Yoga Pant" + }, + "expected_data": { + "name": "Energy-Bulk Man Yoga Pant", + "price": 69.99, + "stock_qty": 50, + 
"color": "60", + "size": "179" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "69.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Energy-Bulk Man Yoga Pant" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "bottom" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "179" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "60" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "bottoms" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 696, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, 
priced at ${{price}}", + "instantiation_dict": { + "product": "FancyBoy Man Causal Jeans", + "stock": "42", + "size": "34", + "color": "Blue", + "price": "169.99" + }, + "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "FancyBoy Man Causal Jeans" + }, + "expected_data": { + "name": "FancyBoy Man Causal Jeans", + "price": 169.99, + "stock_qty": 42, + "color": "50", + "size": "177" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "FancyBoy Man Causal Jeans" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "169.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "bottom" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[size]\"').value", + "required_contents": { + "exact_match": "177" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "bottoms" 
+ ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 697, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Swaatch Smart Watch", + "stock": "42", + "size": "uni-size", + "color": "Blue", + "price": "769.99" + }, + "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Swaatch Smart Watch" + }, + "expected_data": { + "name": "Swaatch Smart Watch", + "price": 769.99, + "stock_qty": "42", + "color": "50", + "size": "uni-size" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + "required_contents": { + "must_include": [ + "Swaatch Smart Watch" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "769.99" + } + }, + { + "url": "last", + "locator": 
"document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "gear" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "50" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "watches" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 698, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "instantiation_dict": { + "product": "Lelelumon Yoga Mat", + "stock": "42", + "size": "uni-size", + "color": "black", + "price": "769.99" + }, + "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "name": "Lelelumon Yoga Mat" + }, + "expected_data": { + "name": "Lelelumon Yoga Mat", + "price": 769.99, + "stock_qty": 42, + "color": "49-black", + "size": "uni-size" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/catalog/product", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"product[name]\"').value", + 
"required_contents": { + "must_include": [ + "Lelelumon Yoga Mat" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "42" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "769.99" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", + "required_contents": { + "must_include": [ + "gear" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"product[color]\"').value", + "required_contents": { + "exact_match": "49" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", + "required_contents": { + "must_include": [ + "fitness equipment" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 256, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 699, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "spring sale", + "rule": "a 20 percent discount site-wide" + }, + "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_sale_details", + "eval_params": { + "topic": "spring sale" + }, + "expected_data": { + "name": "spring sale", + "website_ids": [ + 1 + ], + "customer_group_ids": [ + 1 + ], + 
"discount_amount_type": "by_percent", + "discount_amount": 20 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "spring sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "by_percent" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "20" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 700, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "fall discount", + "rule": "$10 discount on checkout" + }, + 
"intent": "Draft a new marketing price rule for fall discount that offers $10 discount on checkout for all customers", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_sale_details", + "eval_params": { + "topic": "fall discount" + }, + "expected_data": { + "name": "fall discount", + "website_ids": [ + 1 + ], + "customer_group_ids": [ + 1 + ], + "discount_amount_type": "cart_fixed", + "discount_amount": 10 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "fall discount" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "10" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + 
{ + "sites": [ + "shopping_admin" + ], + "task_id": 701, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Mother's day sale", + "rule": "$15 discount on checkout" + }, + "intent": "Draft a new marketing price rule for Mother's day sale that offers $15 discount on checkout for all customers", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_sale_details", + "eval_params": { + "topic": "Mother's day sale" + }, + "expected_data": { + "name": "Mother's day sale", + "website_ids": [ + 1 + ], + "customer_group_ids": [ + 1 + ], + "discount_amount_type": "cart_fixed", + "discount_amount": 15 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Mother's day sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + 
"prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "15" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 702, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Pride Month", + "rule": "45% off on all products" + }, + "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_sale_details", + "eval_params": { + "topic": "Pride Month" + }, + "expected_data": { + "name": "Pride Month", + "website_ids": [ + 1 + ], + "customer_group_ids": [ + 1 + ], + "discount_amount_type": "by_percent", + "discount_amount": 45 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Pride Month" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" 
+ } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "by_percent" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "45" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 703, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Thanks giving sale", + "rule": "$40 discount on checkout" + }, + "intent": "Draft a new marketing price rule for Thanks giving sale that offers $40 discount on checkout for all customers", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_sale_details", + "eval_params": { + "topic": "Thanks giving sale" + }, + "expected_data": { + "name": "Thanks giving sale", + "website_ids": [ + 1 + ], + "customer_group_ids": [ + 1 + ], + "discount_amount_type": "cart_fixed", + "discount_amount": 40 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector(\"[name='name'\").value", + "required_contents": { + "must_include": [ + "Thanks giving sale" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", + "required_contents": { + "exact_match": "0" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"simple_action\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "cart_fixed" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"discount_amount\"').value", + "prep_actions": [ + "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" + ], + "required_contents": { + "exact_match": "40" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 258, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 704, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "for last month" + }, + "intent": "Today is 3/15/2023, generate a sales order report for last month", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "2/1/23", + "to_date": "2/28/23", + "order_statuses": 
[], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "2/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "2/28/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 705, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "sales order report", + "time_span": "over the last 45 days" + }, + "intent": "Today is 3/15/2023, generate a sales order report over the last 45 days", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "1/29/23", + "to_date": "3/15/23", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": 
"1/29/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/15/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 706, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "refund report", + "time_span": "for Q1" + }, + "intent": "Today is 3/15/2023, generate a refund report for Q1", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/refunded/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "1/1/23", + "to_date": "3/31/23", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/23" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 707, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + 
"instantiation_dict": { + "report": "sales order report", + "time_span": "for last year" + }, + "intent": "Today is 3/15/2023, generate a sales order report for last year", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "1/1/2022", + "to_date": "12/31/2022", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/2022" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "12/31/2022" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 708, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": { + "report": "tax report", + "time_span": "for this year" + }, + "intent": "Today is 3/15/2023, generate a tax report for this year", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/tax/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "1/1/2023", + "to_date": "12/31/2023", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + 
"show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/tax/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "1/1/2023" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "12/31/2023" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 268 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 709, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "orders", + "start_date": "beginning of May 2021", + "end_date": "end of March 2022" + }, + "intent": "Create an orders report from beginning of May 2021 to end of March 2022", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "5/1/2021", + "to_date": "3/31/2022", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/2021" + } + }, + { + "url": 
"last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/31/2022" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 710, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "shipping", + "start_date": "08/05/2022", + "end_date": "03/01/2023" + }, + "intent": "Create a shipping report from 08/05/2022 to 03/01/2023", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/shipping/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "8/5/22", + "to_date": "3/1/23", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/shipping", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "8/5/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "3/1/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 711, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + 
"instantiation_dict": { + "type": "product view", + "start_date": "07/05/2021", + "end_date": "05/31/2023" + }, + "intent": "Create a product view report from 07/05/2021 to 05/31/2023", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_product/viewed/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "7/5/21", + "to_date": "5/31/23", + "period_type": "day", + "show_empty_rows": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_product/viewed/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "7/5/21" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "5/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 712, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "coupons", + "start_date": "05/01/2021", + "end_date": "05/15/2023" + }, + "intent": "Create a coupons report from 05/01/2021 to 05/15/2023", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/coupons/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "5/1/21", + "to_date": "5/15/23", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + 
} + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/21" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + "required_contents": { + "exact_match": "5/15/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 713, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": { + "type": "best sellers", + "start_date": "05/01/2022", + "end_date": "05/31/2023" + }, + "intent": "Create a best sellers report from 05/01/2022 to 05/31/2023", + "require_reset": false, + "eval": { + "expected_ui_state": [ + { + "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/bestsellers/", + "eval_func": { + "name": "verify_report_generation_form", + "eval_params": { + "from_date": "5/1/22", + "to_date": "5/31/23", + "order_statuses": [], + "period_type": "day", + "report_type": "created_at_order", + "show_empty_rows": "0", + "show_order_statuses": "0" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_from\"').value", + "required_contents": { + "exact_match": "5/1/22" + } + }, + { + "url": "last", + "locator": "document.querySelector('[id=\"sales_report_to\"').value", + 
"required_contents": { + "exact_match": "5/31/23" + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 271 + }, + { + "sites": [ + "reddit" + ], + "task_id": 714, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "subreddit": "gadgets", + "k": "1" + }, + "intent": "Thumbs down the top 1 post ever in gadgets.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_top_posts", + "expected_data": { + "subreddit": "gadgets", + "k": 1 + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 715, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "subreddit": "history", + "k": "2" + }, + "intent": "Thumbs down the top 2 post ever in history.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + 
"name": "validate_user_downvoted_top_posts", + "expected_data": { + "subreddit": "history", + "k": 2 + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/history/84338/the-scientist-who-discovered-sperm-was-so-grossed-out-he", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/history/105990/4-500-year-old-sumerian-temple-dedicated-to-mighty-thunder", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 716, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "3", + "subreddit": "books" + }, + "intent": "Thumbs down the top 3 post ever in books.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_top_posts", + "expected_data": { + "subreddit": "books", + "k": 3 + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/books/59447/appalachian-prison-book-project-seeks-notebook-donations-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 717, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "4", + "subreddit": "movies" + }, + "intent": "Thumbs down the top 4 post ever in movies.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_top_posts", + "expected_data": { + "subreddit": "movies", + "k": 4 + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__REDDIT__/f/movies/86174/who-will-win-the-oscar-for-actress-in-a-supporting-role", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/86029/who-will-win-the-oscar-for-film-editing", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/86055/cindy-williams-dies-laverne-amp-shirley-star-who-appeared-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/movies/42682/michelle-yeoh-to-receive-palm-springs-film-festival-s", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 718, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": { + "k": "5", + "subreddit": "technology" + }, + "intent": "Thumbs down the top 5 post ever in technology.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": 
"validate_user_downvoted_top_posts", + "expected_data": { + "subreddit": "technology", + "k": 5 + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/technology/48670/brain-cancer-vaccine-succeeds-at-prolonging-survival-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/134696/india-cuts-internet-for-27-million-people-amid-search-for", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/48785/us-judge-orders-amazon-to-cease-and-desist-anti-union", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/70354/activision-s-boston-studio-workers-announce-unionization", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/technology/70233/social-media-influencers-are-charged-with-feeding-followers", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 24, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + 
"category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 719, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "wallstreetbets", + "user": "ThetaGang_wsb" + }, + "intent": "Like all submissions created by ThetaGang_wsb in subreddit wallstreetbets", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "wallstreetbets", + "target_user": "ThetaGang_wsb" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/wallstreetbets/29478/how-will-airbnb-close-following-their-earnings-report-on", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/wallstreetbets/29458/how-much-will-the-federal-reserve-raise-interest-rates-in", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 720, + "require_login": 
true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "earthporn", + "user": "CameronKelsey" + }, + "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "earthporn", + "target_user": "CameronKelsey" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] 
+ }, + { + "sites": [ + "reddit" + ], + "task_id": 721, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "UniversityofBath", + "subreddit": "IAmA" + }, + "intent": "Like all submissions created by UniversityofBath in subreddit IAmA", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "IAmA", + "target_user": "UniversityofBath" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/IAmA/119742/hi-i-m-vienne-a-doctoral-student-at-the-university-of-bath-i", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/119719/hello-reddit-i-m-nazia-mehrban-a-lecturer-in-biotechnology", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/119714/i-m-ellie-jarvis-she-her-a-2nd-year-phd-student-in-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/55155/hi-i-m-dr-lucy-maddox-from-bath-university-uk-i-m-a-clinical", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + 
}, + { + "url": "__REDDIT__/f/IAmA/55142/we-re-sadeka-nujhat-hannah-leese-and-sandhya-moise-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/34032/we-re-sandhya-moise-david-phillips-and-chan-lee-from-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/13175/hi-i-m-kit-yates-i-m-a-mathematical-biologist-at-the", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/IAmA/13170/hello-i-m-dr-sara-fontani-from-the-university-of", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 722, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Don_Gato1", + "subreddit": "nyc" + }, + "original.instantiation_dict": { + "user": "Don_Gato1", + "subreddit": "new york" + }, + "intent": "Like all submissions created by Don_Gato1 in 
subreddit nyc", + "original.intent": "Like all submissions created by Don_Gato1 in subreddit new york", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "nyc", + "target_user": "Don_Gato1" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/nyc/44650/fox-news-hosts-cast-new-york-as-crime-ridden-and-chaotic", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "clarify_instructions", + "note": "Typo in subreddit name" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 723, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "FTorrez81", + "subreddit": "iphone13" + }, + "intent": "Like all submissions created by FTorrez81 in subreddit iphone13", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + 
"fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "reference_answer_raw_annotation": "N/A", + "string_note": "FTorrez81 does not have any submissions in iphone13" + }, + "intent_template_id": 25, + "string_note": "FTorrez81 has no submissions in subreddit iphone13", + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 724, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Hrekires", + "subreddit": "news" + }, + "intent": "Like all submissions created by Hrekires in subreddit news", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "news", + "target_user": "Hrekires" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": 
"__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] 
+ } + }, + { + "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-upvoted" + ] + } + } + ] + }, + "intent_template_id": 25, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 725, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "massachusetts", + "user": "RickyDontLoseThat" + }, + "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_upvoted_all_user_posts", + "expected_data": { + "subreddit": "massachusetts", + "target_user": "RickyDontLoseThat" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/massachusetts/84954/the-last-of-lincoln", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": 
"expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 726, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "subreddit": "earthporn", + "user": "jacyanthis" + }, + "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_all_user_posts", + "expected_data": { + "subreddit": "earthporn", + "target_user": "jacyanthis" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "jacyanthis does not have any submissions in earthporn", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 727, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "PatientBuilder499", + "subreddit": "videos" + }, + "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + 
"name": "validate_user_downvoted_all_user_posts", + "expected_data": { + "subreddit": "videos", + "target_user": "PatientBuilder499" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/videos/115139/hundreds-of-civilian-turkish-volunteers-waiting-to-be-sent", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 728, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "sirbarani", + "subreddit": "sports" + }, + "intent": "DisLike all submissions created by sirbarani in subreddit sports", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_all_user_posts", + "expected_data": { + "subreddit": "sports", + "target_user": "sirbarani" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/sports/48303/iran-football-legend-daei-will-not-attend-world-cup-amid", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + 
"required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 729, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "AdamCannon", + "subreddit": "UpliftingNews" + }, + "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_all_user_posts", + "expected_data": { + "subreddit": "UpliftingNews", + "target_user": "AdamCannon" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 730, + 
"require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": { + "user": "Hrekires", + "subreddit": "news" + }, + "intent": "DisLike all submissions created by Hrekires in subreddit news", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_user_downvoted_all_user_posts", + "expected_data": { + "subreddit": "news", + "target_user": "Hrekires" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": 
"__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + }, + { + "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", + "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", + "required_contents": { + "must_include": [ + "vote vote--user-downvoted" + ] + } + } + ] + }, + "intent_template_id": 1510, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + 
"note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 731, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Nvidia RTX 4090", + "content": "EDIT: This news aged well" + }, + "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "MachineLearning", + "post_id": 1, + "title": "EDIT: This news aged well" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/MachineLearning/1/nvidia-rtx-4090", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "must_include": [ + "EDIT: This news aged well", + "Crazy device for ML!" + ] + } + } + ] + }, + "intent_template_id": 27, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 732, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "The Night Agent", + "content": "Done watching, pretty cool!" 
+ }, + "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "television", + "post_id": 134868, + "title": "Done watching, pretty cool!" + } + } + } + ], + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/134868/the-night-agent-renewed-for-season-2-at-netflix", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Done watching, pretty cool!" + } + } + ] + }, + "intent_template_id": 27, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 733, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Star Trek Starfleet Academy series", + "content": "Every watch makes me feel like a kid again" + }, + "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "validate_post_exists_in_subreddit", + "expected_data": { + "subreddit": "television", + "post_id": 135201, + "title": "Every watch makes me feel like a kid again" + } + } + } + ], + 
"site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135201/star-trek-starfleet-academy-series-from-alex-kurtzman-and", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Every watch makes me feel like a kid again" + } + } + ] + }, + "intent_template_id": 27, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Update to use system check instead of dom check" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "reddit" + ], + "task_id": 734, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Ted Lasso", + "content": "Done watching. I love the renew!" + }, + "intent": "Edit my post on Ted Lasso by adding a line to the body that says \"Done watching. I love the renew!\"", + "require_reset": false, + "eval": { + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135156/ted-lasso-season-3-premiere-scores-870k-u-s-households-up-59", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "Done watching. I love the renew!" 
+ } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "reddit" + ], + "task_id": 735, + "require_login": true, + "storage_state": "./.auth/reddit_state.json", + "start_url": "__REDDIT__", + "geolocation": null, + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": { + "post": "Lord of the Rings", + "content": "The cast is amazing!" + }, + "intent": "Edit my post on Lord of the Rings by adding a line to the body that says \"The cast is amazing!\"", + "require_reset": false, + "eval": { + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__REDDIT__/f/television/135152/lord-of-the-rings-the-rings-of-power-season-2-cast-adds", + "locator": "document.querySelector('.submission__body').outerText", + "required_contents": { + "exact_match": "The cast is amazing!" + } + } + ] + }, + "intent_template_id": 27 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 736, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "instantiation_dict": { + "repo": "gimmiethat.space and dotfiles" + }, + "intent": "Make the LICENSE of gimmiethat.space and dotfiles to MIT license.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "file_has_substring", + "eval_params": { + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "LICENSE", + "substring": "MIT License", + "branch": "main" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + 
"must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." + ] + } + }, + { + "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", + "locator": "", + "required_contents": { + "must_include": [ + "MIT license", + "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." + ] + } + } + ] + }, + "intent_template_id": 355, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 737, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Philadelphia 76ers", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Wells Fargo Center", + "South Philadelphia Sports Complex" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content 
select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 738, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Philadelphia 76ers", + "time": "in the 70th" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "3601 South Broad Street", + "South Philadelphia" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 739, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Yankees", + "time": "in the 80th" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th", + "require_reset": false, + "eval": { + "site": 
"map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Yankee Stadium", + "East 161st Street" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 740, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "NYC NBA team", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team ", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Madison Square Garden", + "Pennsylvania Plaza", + "Manhattan", + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } 
+ } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "wikipedia", + "map" + ], + "task_id": 741, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "instantiation_dict": { + "location": "Carnegie Mellon University", + "sport_team": "Boston home NBA team", + "time": "" + }, + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "TD Garden", + "Boston", + "Massachusetts" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 94 + }, + { + "sites": [ + "gitlab" + ], + "task_id": 742, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "private", + "account_list": "Abishek, Vinta" + }, + "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_visibility_and_members", + "eval_params": { + "group": "byteblaze", + "project": "planner", + 
"visibility": "private", + "members": [ + "abisubramanya27", + "vinta" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/planner", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/planner/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 743, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "Abishek, Vinta" + }, + "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_visibility_and_members", + "eval_params": { + "group": "byteblaze", + "project": "web_arena", + "visibility": "public", + "members": [ + "abisubramanya27", + "vinta" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_arena", + "locator": 
"document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_arena/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 744, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "primer" + }, + "intent": "Create a new public project \"AutoAGI\" and add primer as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_visibility_and_members", + "eval_params": { + "group": "byteblaze", + "project": "AutoAGI", + "visibility": "public", + "members": [ + "primer" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/AutoAGI", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AutoAGI/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", 
+ "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 745, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "public", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_visibility_and_members", + "eval_params": { + "group": "byteblaze", + "project": "awesome-llms", + "visibility": "public", + "members": [ + "primer", + "convexegg", + "abisubramanya27" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome-llms", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "public" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome-llms/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 746, + "require_login": true, + 
"storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", + "instantiation_dict": { + "scope": "private", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_visibility_and_members", + "eval_params": { + "group": "byteblaze", + "project": "llm_bulk_inference", + "visibility": "private", + "members": [ + "primer", + "convexegg", + "abisubramanya27" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/llm_bulk_inference", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/llm_bulk_inference/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 747, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + 
"project_name": "awesome_web_agents", + "template": "blank", + "account_list": "Abishek, Vinta" + }, + "intent": "Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_collaborators_and_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "awesome_web_agents", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initial commit" + ], + "require_all_substrings": true, + "collaborators": [ + "abisubramanya27", + "vinta" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/awesome_web_agents", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome_web_agents/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initial commit" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@abisubramanya27", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation of collaborators and message is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 748, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with 
{{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "web_agent_android_xl", + "template": "Android", + "account_list": "primer, convexegg, abishek" + }, + "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_collaborators_and_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "web_agent_android_xl", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initialized from 'Android' project template" + ], + "require_all_substrings": true, + "collaborators": [ + "primer", + "convexegg", + "abisubramanya27" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'Android' project template" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@abisubramanya27" + ] + } + } + ] + }, + "intent_template_id": 2100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 749, + 
"require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "project_site", + "template": "NodeJS", + "account_list": "primer, convexegg, vinta" + }, + "intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_collaborators_and_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "project_site", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initialized from 'NodeJS Express' project template" + ], + "require_all_substrings": true, + "collaborators": [ + "primer", + "convexegg", + "vinta" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/project_site", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/project_site/-/commits", + "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'NodeJS Express' project template" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/project_site/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@primer", + "@convexegg", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched 
from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 750, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "agi_index", + "template": "HTML", + "account_list": "Vinta Chen" + }, + "intent": "Start a private project agi_index with HTML template and add Vinta Chen as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_fields_and_collaborators", + "eval_params": { + "group": "byteblaze", + "project": "agi_index", + "fields": { + "name": "agi_index", + "description": "plain HTML", + "visibility": "private" + }, + "collaborators": [ + "vinta" + ], + "description_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/agi_index", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/agi_index", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
+ ] + } + }, + { + "url": "__GITLAB__/byteblaze/agi_index/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "Vinta Chen" + ] + } + } + ] + }, + "intent_template_id": 2100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 751, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "AGISite", + "template": "JEKYLL", + "account_list": "Rohan and Vinta" + }, + "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_fields_and_collaborators", + "eval_params": { + "group": "byteblaze", + "project": "AGISite", + "fields": { + "name": "AGISite", + "description": "Jekyll site", + "visibility": "private" + }, + "collaborators": [ + "Seirdy", + "vinta" + ], + "description_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/AGISite", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AGISite", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example Jekyll site 
using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." + ] + } + }, + { + "url": "__GITLAB__/byteblaze/AGISite/-/project_members", + "locator": "", + "required_contents": { + "must_include": [ + "@Seirdy", + "@vinta" + ] + } + } + ] + }, + "intent_template_id": 2100, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 752, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent", + "template": "blank" + }, + "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "web_agent", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initial commit" + ], + "require_all_substrings": false + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent/-/commits", + 
"locator": "", + "required_contents": { + "must_include": [ + "Initial commit" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "intent_template_id", + "category": "reference_alignment", + "note": "Group similar tasks together" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation via project_with_visibility_has_initial_commit_message is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 753, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_android_xs", + "template": "Android" + }, + "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "web_agent_android_xs", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initialized from 'Android' project template" + ], + "require_all_substrings": false + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_android_xs", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_android_xs/-/commits", 
+ "locator": "", + "required_contents": { + "must_include": [ + "Initialized from 'Android' project template" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "intent_template_id", + "category": "reference_alignment", + "note": "Group similar tasks together" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 754, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_nodejs", + "template": "NodeJS" + }, + "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_with_visibility_has_initial_commit_message", + "eval_params": { + "group": "byteblaze", + "project": "web_agent_nodejs", + "visibility": "private", + "initial_commit_message_substrings": [ + "Initialized from 'NodeJS Express' project template" + ], + "require_all_substrings": false + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_nodejs", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_nodejs/-/commits", + "locator": "", + 
"required_contents": { + "must_include": [ + "Initialized from 'NodeJS Express' project template" + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "intent_template_id", + "category": "reference_alignment", + "note": "Group similar tasks together" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 755, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "web_agent_index", + "template": "HTML" + }, + "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_fields", + "eval_params": { + "group": "byteblaze", + "project": "web_agent_index", + "fields": { + "name": "web_agent_index", + "description": "plain HTML", + "visibility": "private" + }, + "description_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/web_agent_index", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/web_agent_index", + "locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + 
"must_include": [ + "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "intent_template_id", + "category": "reference_alignment", + "note": "Group similar tasks together" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 756, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": { + "project_name": "11711_gitlab", + "template": "JEKYLL" + }, + "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_fields", + "eval_params": { + "group": "byteblaze", + "project": "11711_gitlab", + "fields": { + "name": "11711_gitlab", + "description": "Jekyll site", + "visibility": "private" + }, + "description_is_substring": true + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/byteblaze/11711_gitlab", + "locator": "document.querySelector('.visibility-icon').getAttribute('title')", + "required_contents": { + "must_include": [ + "Private" + ] + } + }, + { + "url": "__GITLAB__/byteblaze/11711_gitlab", + 
"locator": "document.querySelector('.home-panel-description-markdown').outerText", + "required_contents": { + "must_include": [ + "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." + ] + } + } + ] + }, + "intent_template_id": 332, + "changelogs": [ + { + "key": "intent_template_id", + "category": "reference_alignment", + "note": "Group similar tasks together" + }, + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "API-based validation is more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "map" + ], + "task_id": 757, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "instantiation_dict": { + "city1": "home of the 1980 Super Bowl champions", + "city2": "home of the 1991 Super Bowl champions" + }, + "intent": "Show me the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + } + ] + }, + 
"intent_template_id": 42 + }, + { + "sites": [ + "map" + ], + "task_id": 758, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "instantiation_dict": { + "city1": "the big apple", + "city2": "biggest city in Maine" + }, + "intent": "Show me the path and travel time from the big apple to biggest city in Maine.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Portland", + "Maine" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map", + "shopping_admin" + ], + "task_id": 759, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "instantiation_dict": { + "city1": "the city where my E-commerce customer Sophia Young lives", + "city2": "New York City" + }, + "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City", + "require_reset": false, + "eval": { + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content 
select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Boston" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "New York" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map", + "shopping_admin" + ], + "task_id": 760, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "instantiation_dict": { + "city1": "Allentown, PA", + "city2": "the city where my E-commerce customer Amanda Kim lives" + }, + "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives", + "require_reset": false, + "eval": { + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Allentown" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Hoboken", + "New Jersey" + ] + } + } + ] + }, + "intent_template_id": 42 + }, + { + "sites": [ + "map" + ], + "task_id": 761, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + 
"instantiation_dict": { + "location/address_1": "Carnegie Science Museum", + "location/address_2": "Hunt library CMU", + "transportation": "walk" + }, + "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk options.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Science Center", + "Allegheny County", + "Pittsburgh" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Hunt Library", + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 54 + }, + { + "sites": [ + "map" + ], + "task_id": 762, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "instantiation_dict": { + "location/address_1": "Carnegie Music Hall in NYC", + "location/address_2": "Carnegie Mellon University", + "transportation": "driving" + }, + "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "last", + "locator": 
"document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "Carnegie Hall", + "West 57th Street", + "Manhattan", + "New York" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Carnegie Mellon University", + "Pittsburgh" + ] + } + } + ] + }, + "intent_template_id": 54 + }, + { + "sites": [ + "map" + ], + "task_id": 763, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Trader Joe's", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Trader Joe's from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Trader Joe's, 6343, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 764, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Target", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Target 
from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Target, 6231, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 765, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "Japanese food market", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest Japanese food market from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Tokyo Japanese Food Store, 5855, Ellsworth Avenue, Shadyside" + ] + } + } + 
] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 766, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "grocessory owned by Amazon", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest grocessory owned by Amazon from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Whole Foods Market, 5700, Penn Avenue, East Liberty" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "map" + ], + "task_id": 767, + "require_login": true, + "storage_state": null, + "start_url": "__MAP__", + "geolocation": null, + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": { + "store": "chain grocessory owned by a local business", + "location": "401 Shady Ave, Pittsburgh" + }, + "intent": "Find the walkway to the closest chain grocessory owned by a local business from 401 Shady Ave, Pittsburgh.", + "require_reset": false, + "eval": { + "site": "map" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "last", + "locator": 
"document.querySelector(\"div#content select.routing_engines\").selectedIndex", + "required_contents": { + "exact_match": "2" + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_from\"').value", + "required_contents": { + "must_include": [ + "401, Shady Avenue, Shadyside" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('[name=\"route_to\"').value", + "required_contents": { + "must_include": [ + "Giant Eagle, 5550, Centre Avenue, Shadyside" + ] + } + } + ] + }, + "intent_template_id": 75 + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 768, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{quantity}} {{product}} arrived, update the stock", + "instantiation_dict": { + "quantity": "5", + "product": "blue Cronus yoga pants with size 33" + }, + "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "872" + }, + "expected_data": { + "stock_qty": 5, + "in_stock": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "5" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 241, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": 
"Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 769, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", + "instantiation_dict": { + "quantity": "378", + "product": "brown Aero daily fitness tee in every size" + }, + "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "544" + }, + "expected_data": { + "stock_qty": 478 + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "547" + }, + "expected_data": { + "stock_qty": 478 + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "550" + }, + "expected_data": { + "stock_qty": 478 + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "553" + }, + "expected_data": { + "stock_qty": 478 + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "556" + }, + "expected_data": { + "stock_qty": 478 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/544/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/547/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + 
"required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/550/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/553/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/556/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "478" + } + } + ] + }, + "intent_template_id": 241, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 770, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "We've received {{quantity}}, update the inventory.", + "instantiation_dict": { + "quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29" + }, + "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1836" + }, + "expected_data": { + "stock_qty": 112 + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1838" + }, + "expected_data": { + "stock_qty": 156 + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1836/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "112" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/", + "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", + "required_contents": { + "exact_match": "156" + } + } + ] + }, + "intent_template_id": 241, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 771, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Approve the positive reviews to display in our store.", + "instantiation_dict": {}, + "intent": "Approve the positive reviews to display in our store.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "352" + }, + "expected_data": { + "approved": true + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "349" + }, + "expected_data": { + "approved": true + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "347" + }, + "expected_data": { + "approved": true + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/352", + "locator": "document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", + "locator": 
"document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/347", + "locator": "document.querySelector('[name=\"status_id\"').value", + "required_contents": { + "exact_match": "1" + } + } + ] + }, + "intent_template_id": 243, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 772, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending negative reviews for Circe fleece" + }, + "intent": "Delete all pending negative reviews for Circe fleece", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "999" + }, + "expected_data": { + "exists": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/999", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 773, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending 
negative reviews" + }, + "intent": "Delete all pending negative reviews", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "351" + }, + "expected_data": { + "exists": false + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "353" + }, + "expected_data": { + "exists": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/353", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 774, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "pending reviews with less than 4 stars" + }, + "intent": "Delete all pending reviews with less than 4 stars", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "351" + }, + "expected_data": { + "exists": false + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "353" + }, + "expected_data": { + "exists": false + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + 
"eval_params": { + "review_id": "349" + }, + "expected_data": { + "exists": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/353", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 775, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "reviews from the scammer Arden" + }, + "intent": "Delete all reviews from the scammer Arden", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "51" + }, + "expected_data": { + "exists": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/51", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": 
"expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 776, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "reviews from the scammer Carlo" + }, + "intent": "Delete all reviews from the scammer Carlo", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "93" + }, + "expected_data": { + "exists": false + } + } + }, + { + "eval_func": { + "name": "verify_review_details", + "eval_params": { + "review_id": "109" + }, + "expected_data": { + "exists": false + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/93", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + }, + { + "url": "__SHOPPING_ADMIN__/review/product/edit/id/109", + "locator": "", + "required_contents": { + "must_include": [ + "Rating isn't Available" + ] + } + } + ] + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 777, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "$5", + "action": "Reduce", + "config": "green Hollister 
backyard sweatshirt in all sizes" + }, + "original.instantiation_dict": { + "amount": "$5", + "action": "Reduce", + "config": "green Hollister backyard sweater in all size" + }, + "intent": "Reduce the price of green Hollister backyard sweatshirt in all sizes by $5", + "original.intent": "Reduce the price of green Hollister backyard sweater in all size by $5", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "120" + }, + "expected_data": { + "price": "47.00" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "117" + }, + "expected_data": { + "price": "47.00" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "114" + }, + "expected_data": { + "price": "47.00" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "111" + }, + "expected_data": { + "price": "47.00" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "123" + }, + "expected_data": { + "price": "47.00" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/120/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/117/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/114/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/111/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/123/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "47.00" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Use proper pluralization" + }, + { + "key": "instantiation_dict", + "category": "task_ambiguity", + "note": "No sweater product exists updating to correct product sweatshirt" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 778, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "13.5%", + "action": "Reduce", + "config": "size 28 Sahara leggings" + }, + "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1841" + }, + "expected_data": { + "price": "64.88" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1852" + }, + "expected_data": { + "price": "64.88" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1843" + }, + "expected_data": { + "price": "64.88" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", 
+ "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1841/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1842/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1843/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.88" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 779, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "15%", + "action": "Reduce", + "config": "yellow shirts from Gwyn Endurance in all sizes below L" + }, + "original.instantiation_dict": { + "amount": "15%", + "action": "Reduce", + "config": "yellow shirts from Gwyn Endurance in all size below L" + }, + "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all sizes below L by 15%", + "original.intent": "Reduce the price of yellow shirts from Gwyn Endurance in all size below L by 15%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1559" + }, + "expected_data": { + "price": "20.40" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1562" + }, + "expected_data": { + "price": "20.40" + } + } + }, + { + "eval_func": { + 
"name": "verify_product_details", + "eval_params": { + "product_id": "1565" + }, + "expected_data": { + "price": "20.40" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1559/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1562/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1565/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "20.40" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "spelling_or_grammar", + "note": "Use proper pluralization" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 780, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "$17", + "action": "Increase", + "config": "white Ingrid Running with size L and above" + }, + "intent": "Increase the price of white Ingrid Running with size L and above by $17", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1264" + }, + "expected_data": { + "price": "101.00" + } + } + }, + { + "eval_func": { + 
"name": "verify_product_details", + "eval_params": { + "product_id": "1267" + }, + "expected_data": { + "price": "101.00" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1264/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.00" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1267/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "64.00" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "reference_alignment", + "note": "Original value was decrese of $20 when intent states increase of $17" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 781, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "37%", + "action": "Increase", + "config": "black fitness tshirts from Desiree with size XS" + }, + "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "1573" + }, + "expected_data": { + "price": "32.88" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1573/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "32.88" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 782, + "require_login": true, + "storage_state": "./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "{{action}} the price of {{config}} by {{amount}}", + "instantiation_dict": { + "amount": "23%", + "action": "Increase", + "config": "all blue running tshirts in extra small and small sizes" + }, + "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "496" + }, + "expected_data": { + "price": "35.67" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "499" + }, + "expected_data": { + "price": "35.67" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "479" + }, + "expected_data": { + "price": "34.44" + } + } + }, + { + "eval_func": { + "name": "verify_product_details", + "eval_params": { + "product_id": "482" + }, + "expected_data": { + "price": "34.44" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/496/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "22.33" + } + }, + { + "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/499/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "22.33" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/479/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "21.56" + } + }, + { + "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/482/", + "locator": "document.querySelector('[name=\"product[price]\"').value", + "required_contents": { + "exact_match": "21.56" + } + } + ] + }, + "intent_template_id": 742, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "reference_alignment", + "note": "Original values were decresed by 23% not increased as stated in the intent" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 783, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "kkroening/ffmpeg-python", + "role": "maintainer", + "user_list": [ + "yjlou", + "a11yproject" + ] + }, + "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "not the owner of the repo so cannot add maintainer", + "reference_answer_raw_annotation": "N/A" + }, + 
"intent_template_id": 351, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "byteblaze user does not have permissions to do this" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 784, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "email address" + }, + "original.instantiation_dict": { + "branch_name": "main", + "attribute": "email address" + }, + "intent": "Tell me the email address of the contributor who has the most commits to branch master", + "original.intent": "Tell me the email address of the contributor who has the most commits to branch main", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "secupwn@users.noreply.github.com" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "secupwn@users.noreply.github.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "secupwn@users.noreply.github.com" + }, + "intent_template_id": 316, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge score" + }, + { + "key": "instantiation_dict", + "category": "reference_alignment", + "note": "No such branch named main" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 785, + 
"require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "gh-page", + "attribute": "email address" + }, + "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "secupwn@users.noreply.github.com" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "secupwn@users.noreply.github.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "secupwn@users.noreply.github.com" + }, + "intent_template_id": 316, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 786, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/vinta/awesome-python", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "number of commits" + }, + "original.instantiation_dict": { + "branch_name": "main", + "attribute": "number of commits" + }, + "intent": "Tell me the number of commits of the contributor who has the most commits to branch master", + 
"original.intent": "Tell me the number of commits of the contributor who has the most commits to branch main", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "numeric" + }, + "expected_data": [ + 412 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "412" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "412" + }, + "intent_template_id": 316, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "reference_alignment", + "note": "No such branch named main; original answer incorrect" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 787, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/dehenne/awesome-visibility/-/graphs/master", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "number of followers" + }, + "original.instantiation_dict": { + "branch_name": "main", + "attribute": "number of followers" + }, + "intent": "Tell me the number of followers of the contributor who has the most commits to branch master", + "original.intent": "Tell me the number of followers of the contributor who has the most commits to branch main", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + 
"type": "numeric" + }, + "expected_data": [ + 0 + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "0" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "0" + }, + "intent_template_id": 316, + "changelogs": [ + { + "key": "instantiation_dict", + "category": "reference_alignment", + "note": "No such branch named main; original answer incorrect" + }, + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 788, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed", + "geolocation": null, + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": { + "branch_name": "php52", + "attribute": "full name, gitlab account name, location and email address" + }, + "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "verify_retrieved_value_normalized", + "eval_params": { + "ordered": "false", + "type": "text" + }, + "expected_data": [ + "Andrew M. Whalen", + "amwhalen", + "Massachusetts", + "github@amwhalen.com" + ] + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "must_include": [ + "Andrew M. 
Whalen", + "amwhalen", + "Massachusetts", + "github@amwhalen.com" + ] + }, + "reference_url": "", + "program_html": [], + "string_note": "", + "reference_answer_raw_annotation": "Andrew M. Whalen, amwhalen, Massachusetts, github@amwhalen.com" + }, + "intent_template_id": 316, + "format_specification": "Return value is a list of format [full name, gitlab account name, location, email address]", + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "avoid_permissive_text", + "note": "Use structured response to judge correctness" + }, + { + "key": "eval", + "category": "permissive_string_match", + "note": "Switched to exact match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 789, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue {{issue}} in {{repo}}.", + "instantiation_dict": { + "repo": "huggingface dataset", + "issue": "asking about do they have any plan on supporting Webagent in the next quater" + }, + "intent": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "RESOURCE_NOT_FOUND_ERROR" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "huggingface repo does not exist", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 328, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Expect the right error code for unachievable tasks" + } + ] + }, + { + "sites": [ + "shopping_admin" + ], + "task_id": 790, + "require_login": true, + "storage_state": 
"./.auth/shopping_admin_state.json", + "start_url": "__SHOPPING_ADMIN__", + "geolocation": null, + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": { + "review_type": "negative reviews for Sybil running short" + }, + "intent": "Delete all negative reviews for Sybil running short", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping_admin" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "No negative reviews for sybil running short", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 246, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "gitlab", + "reddit" + ], + "task_id": 791, + "require_login": true, + "storage_state": "./.auth/gitlab.reddit_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Make a folder named {{directory}} on the {{gitlab_repo}} repo and include a file called urls.txt that consists of the links to the 5 most recent posts from {{subreddit}}.", + "instantiation_dict": { + "directory": "car", + "subreddit": "cars", + "gitlab_repo": "gimmiethat.space" + }, + "intent": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.", + "require_reset": false, + "eval": { + "site": "reddit" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "the subreddit does not exist", + "reference_answer_raw_annotation": "N/A" + }, + 
"intent_template_id": 84, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 792, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Beauty & Personal Care", + "dollar_value": "under 20" + }, + "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The website does not support sorting by rating and there are too many products in this category", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 172, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 793, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": { + "product_category": "Men clothing", + "dollar_value": "above 50 but under 129.99" + }, + "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 
129.99.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The website does not support sorting by rating and there are too many products in this category", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 172, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 794, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "4000 Forbes Ave, Pittsburgh, PA" + }, + "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 795, + 
"require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "6726 McPherson Blvd, Pittsburgh, PA" + }, + "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 796, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "155 5th Street, San Francisco, CA" + }, + "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + 
"string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 797, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "3 Oxford St, Cambridge, MA" + }, + "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.", + "require_reset": false, + "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "shopping" + ], + "task_id": 798, + "require_login": true, + "storage_state": "./.auth/shopping_state.json", + "start_url": "__SHOPPING__", + "geolocation": null, + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": { + "address": "77 Massachusetts Ave, Cambridge, MA" + }, + "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", + "require_reset": false, 
+ "eval": { + "expected_retrieve_value": [ + { + "eval_func": { + "name": "validate_status", + "expected_data": { + "expected_status": "ACTION_NOT_ALLOWED_ERROR" + } + } + } + ], + "site": "shopping" + }, + "original.eval": { + "eval_types": [ + "string_match" + ], + "reference_answers": { + "fuzzy_match": "N/A" + }, + "reference_url": "", + "program_html": [], + "string_note": "The shipping address cannot be changed after an order is placed in the website.", + "reference_answer_raw_annotation": "N/A" + }, + "intent_template_id": 191, + "changelogs": [ + { + "key": "expected_retrieve_value", + "category": "unachievable_tasks", + "note": "Status check used instead of fuzzy match" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 799, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "n-lab", + "members": "patou, egpast, westurner, jontutcher" + }, + "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_group_exist_with_members", + "eval_params": { + "group_name": "n-lab", + "member_usernames": [ + "patou", + "egpast", + "westurner", + "jontutcher" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/n-lab/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@patou", + "@egpast", + "@westurner", + "@jontutcher" + ] + } + } + ] + }, + "intent_template_id": 600, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify group and members; more accurate" + }, + { + "key": 
"expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 800, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "x-lab", + "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" + }, + "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_group_exist_with_members", + "eval_params": { + "group_name": "x-lab", + "member_usernames": [ + "JonasVautherin", + "dilipchandima", + "dawiss1337", + "bmyun", + "DCMJY" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/x-lab/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@JonasVautherin", + "@dilipchandima", + "@dawiss1337", + "@bmyun", + "@DCMJY" + ] + } + } + ] + }, + "intent_template_id": 600, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify group and members; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 801, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "crew", + "members": "ASWATFZLLC, 
patrickhlauke, westurner, linkmatrix" + }, + "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_group_exist_with_members", + "eval_params": { + "group_name": "crew", + "member_usernames": [ + "ASWATFZLLC", + "patrickhlauke", + "westurner", + "linkmatrix" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/crew/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@ASWATFZLLC", + "@patrickhlauke", + "@westurner", + "@linkmatrix" + ] + } + } + ] + }, + "intent_template_id": 600, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify group and members; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 802, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "coding_friends", + "members": "qhduan, Agnes-U" + }, + "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_group_exist_with_members", + "eval_params": { + "group_name": "coding_friends", + "member_usernames": [ + "qhduan", + "Agnes-U" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + 
"url": "__GITLAB__/groups/coding_friends/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@qhduan", + "@Agnes-U" + ] + } + } + ] + }, + "intent_template_id": 600, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify group and members; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 803, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": { + "name": "webagent", + "members": "pandey2000, sayakpaul, sayakpaul" + }, + "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "does_group_exist_with_members", + "eval_params": { + "group_name": "webagent", + "member_usernames": [ + "pandey2000", + "sayakpaul" + ] + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/groups/webagent/-/group_members", + "locator": "", + "required_contents": { + "must_include": [ + "@pandey2000", + "@sayakpaul" + ] + } + } + ] + }, + "intent_template_id": 600, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify group and members; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 804, + "require_login": true, + 
"storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", + "instantiation_dict": { + "issue": "flash alerts", + "account": "myself and primer" + }, + "intent": "Assign the issue regarding flash alerts to myself and primer.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "issue_assignee_is", + "eval_params": { + "usernames": [ + "byteblaze", + "primer" + ], + "group": "primer", + "project": "design", + "issue_iid": 104 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + }, + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=primer", + "locator": "", + "required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + } + ] + }, + "intent_template_id": 999, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 805, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/a11yproject/a11yproject.com", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + 
"instantiation_dict": { + "source_branch": "the branch that implements the support of template strings", + "target_branch": "master", + "reviewer": "myself and Roshan" + }, + "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan as the reviewer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_merge_request_has_values", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "source_branch": "feature/replace-gulp", + "target_branch": "main", + "reviewer": "byteblaze" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "feature/replace-gulp" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze", + "Roshan Jossy" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 806, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + 
"geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "a11yproject.com/redesign", + "target_branch": "markdown-figure-block", + "reviewer": "myself" + }, + "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_merge_request_has_values", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "source_branch": "redesign", + "target_branch": "feature/markdown-figure-block", + "reviewer": "byteblaze" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "redesign" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "feature/markdown-figure-block" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" 
+ ], + "task_id": 807, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__/primer/design", + "geolocation": null, + "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "instantiation_dict": { + "source_branch": "build time debug", + "target_branch": "main", + "reviewer": "myself" + }, + "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "check_merge_request_has_values", + "eval_params": { + "group": "primer", + "project": "design", + "source_branch": "debug-build-time", + "target_branch": "main", + "reviewer": "byteblaze" + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/primer/design/-/merge_requests", + "program_html": [ + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", + "required_contents": { + "exact_match": "main" + } + }, + { + "url": "last", + "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", + "required_contents": { + "exact_match": "debug-build-time" + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.reviewer').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 335, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + 
}, + { + "sites": [ + "gitlab" + ], + "task_id": 808, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "cloud-to-butt", + "issue": "Let's keep the project alive", + "account": "myself", + "due": "the end of Q1 2033" + }, + "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "byteblaze", + "project": "cloud-to-butt", + "fields": { + "title": "Let's keep the project alive", + "assignee.username": "byteblaze", + "due_date": "2033-03-31" + } + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "Let's keep the project alive" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Mar 31, 2033" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": 
"dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 809, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "instantiation_dict": { + "repo": "a11yproject", + "issue": "404 for many URLs", + "account": "myself", + "due": "2030-1-3" + }, + "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "project_has_issue_with_fields", + "eval_params": { + "group": "a11yproject", + "project": "a11yproject.com", + "fields": { + "title": "404 for many URLs", + "assignee.username": "byteblaze", + "due_date": "2030-01-03" + } + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "url_match", + "program_html" + ], + "reference_answers": null, + "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "program_html": [ + { + "url": "last", + "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", + "required_contents": { + "exact_match": "404 for many URLs" + } + }, + { + "url": "last", + "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", + "required_contents": { + "must_include": [ + "Jan 3, 2030" + ] + } + }, + { + "url": "last", + "locator": "document.querySelector('.block.assignee').outerText", + "required_contents": { + "must_include": [ + "Byte Blaze" + ] + } + } + ], + "url_note": "GOLD in PRED" + }, + "intent_template_id": 327, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + 
"key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 810, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", + "instantiation_dict": { + "repo": "primer design guide", + "issue": "flash alert", + "account": "myself" + }, + "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "issue_assignee_is", + "eval_params": { + "usernames": [ + "byteblaze" + ], + "group": "primer", + "project": "design", + "issue_iid": 104 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "Add documentation on using Flash alerts in dialog components", + "Clarify usage of flash alert" + ] + } + } + ] + }, + "intent_template_id": 999, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + }, + { + "sites": [ + "gitlab" + ], + "task_id": 811, + "require_login": true, + "storage_state": "./.auth/gitlab_state.json", + "start_url": "__GITLAB__", + "geolocation": null, + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": { + "repo": "a11yproject", + "issue": 404, + "account": 
"myself" + }, + "intent": "Assign the issue regarding 404 in a11yproject to myself.", + "require_reset": false, + "eval": { + "expected_backend_state": [ + { + "eval_func": { + "name": "issue_assignee_is", + "eval_params": { + "usernames": [ + "byteblaze" + ], + "group": "a11yproject", + "project": "a11yproject.com", + "issue_iid": 1478 + } + } + } + ], + "site": "gitlab" + }, + "original.eval": { + "eval_types": [ + "program_html" + ], + "reference_answers": null, + "reference_url": "", + "program_html": [ + { + "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", + "locator": "", + "required_contents": { + "must_include": [ + "404s, bad host, timeouts, bad urls for URLs linked from website" + ] + } + } + ] + }, + "intent_template_id": 999, + "changelogs": [ + { + "key": "expected_backend_state", + "category": "switch_to_api_eval", + "note": "Use API to verify; more accurate" + }, + { + "key": "expected_backend_state", + "category": "dom_dependent_eval", + "note": "Switched from dom dependent evaluation to backend state check" + } + ] + } +] \ No newline at end of file From 02dfc5747a74841cbc6618253fee9086b6946d3b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 19 Sep 2025 16:03:58 +0000 Subject: [PATCH 04/64] update dependencies --- browsergym/webarena_verified/README.md | 1 + browsergym/webarena_verified/pyproject.toml | 2 +- browsergym/webarena_verified/requirements.txt | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 11244aac..658dd0ee 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -21,6 +21,7 @@ pip install browsergym-webarena-verified This will automatically install the required dependencies from local file paths: - `webarena-verified` from local platform-labs-agent-eval-harness repository - `agent-eval-harness-common` from local 
platform-labs-agent-eval-harness repository +- `agent-eval-harness-pytest` from local platform-labs-agent-eval-harness repository **Note**: This package requires the [platform-labs-agent-eval-harness](https://github.com/ServiceNow/platform-labs-agent-eval-harness) repository to be cloned locally at `/home/toolkit/platform-labs-agent-eval-harness` before installation. diff --git a/browsergym/webarena_verified/pyproject.toml b/browsergym/webarena_verified/pyproject.toml index 10c9d726..da593c2a 100644 --- a/browsergym/webarena_verified/pyproject.toml +++ b/browsergym/webarena_verified/pyproject.toml @@ -8,7 +8,7 @@ description = "WebArena Verified benchmark for BrowserGym" authors = [ {name = "ServiceNow"}, ] -requires-python = ">=3.13" +requires-python = ">=3.12" license = {text = "Apache-2.0"} classifiers = [ "Development Status :: 3 - Alpha", diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index f95ac098..4db23ded 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -2,3 +2,4 @@ browsergym-core==0.14.2 libwebarena==0.0.4 webarena-verified @ file:///home/toolkit/platform-labs-agent-eval-harness/benchmarks/webarena-verified agent-eval-harness-common @ file:///home/toolkit/platform-labs-agent-eval-harness/packages/agent-eval-harness-common +agent-eval-harness-pytest @ file:///home/toolkit/platform-labs-agent-eval-harness/packages/agent-eval-harness-pytest From 48acaebd42f6f945a606948d63e11639f42595f9 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 22 Sep 2025 19:50:27 +0000 Subject: [PATCH 05/64] start adding integration with wa_verified --- .../webarena_verified/browsergym_adapter.py | 245 ++++++++++++++++++ .../webarena_verified/evaluators.py | 78 ++++++ .../src/browsergym/webarena_verified/task.py | 133 ++++++++++ 3 files changed, 456 insertions(+) create mode 100644 
browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/task.py diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py new file mode 100644 index 00000000..da1829a2 --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py @@ -0,0 +1,245 @@ +""" +BrowserGym adapter for WebArena Verified evaluation system. + +This module provides adapter classes that bridge the gap between BrowserGym's +page-based approach and the platform-labs evaluation system's resource-based approach. +""" + +import logging +from typing import Any, Dict, Optional + +from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement +from playwright.async_api import async_playwright + +from webarena_verified.evaluation.evaluator import WebArenaEvaluator +from webarena_verified.types import ( + ActionType, + StatusType, + WebArenaTask, + WebArenaTaskResponse, + WebArenaTaskStatus, + WebArenaVerifiedAgentResponse, +) + +logger = logging.getLogger(__name__) + + + +class BrowserGymResponseAdapter: + """ + Adapter that converts BrowserGym task results to WebArenaTaskResponse format. + """ + + @staticmethod + def create_response_from_result( + page, + agent_response: Optional[str] = None, + status: WebArenaTaskStatus = WebArenaTaskStatus.SUCCESS, + error_details: Optional[list[str]] = None, + ) -> WebArenaTaskResponse: + """ + Create a WebArenaTaskResponse from BrowserGym task execution results. 
+ + Args: + page: Playwright page object + agent_response: Optional agent response string + status: Task execution status + error_details: Optional error details + Returns: + WebArenaTaskResponse object + """ + try: + # Get current and recent URLs + current_url = page.url + last_urls = [current_url] # In a full implementation, this would track navigation history + + # Create agent response if provided + verified_response = None + if agent_response is not None: + # Try to parse the response and determine action type + action_type = BrowserGymResponseAdapter._determine_action_type(agent_response) + response_status = ( + StatusType.SUCCESS + if status == WebArenaTaskStatus.SUCCESS + else StatusType.UNKNOWN_ERROR + ) + + # Extract results if it's a retrieve action + results = None + if action_type == ActionType.RETRIEVE and response_status == StatusType.SUCCESS: + results = BrowserGymResponseAdapter._extract_results(agent_response) + + verified_response = WebArenaVerifiedAgentResponse( + action=action_type, + status=response_status, + results=results, + error_details=error_details[0] if error_details else None + ) + + return WebArenaTaskResponse( + response=verified_response, + last_urls=last_urls, + status=status, + error_details=error_details, + ) + + except Exception as e: + logger.error(f"Error creating WebArenaTaskResponse: {e}") + return WebArenaTaskResponse( + response=None, + last_urls=[page.url if page else ""], + status=WebArenaTaskStatus.AGENT_FAILURE, + error_details=[str(e)], + ) + + @staticmethod + def _determine_action_type(response: str) -> ActionType: + """ + Determine the action type from the response content. 
+ + Args: + response: Agent response string + + Returns: + ActionType enum value + """ + response_lower = response.lower() + + # Simple heuristics to determine action type + if any(word in response_lower for word in ["get", "find", "search", "retrieve", "show", "list"]): + return ActionType.RETRIEVE + elif any(word in response_lower for word in ["create", "add", "update", "delete", "modify", "change"]): + return ActionType.MUTATE + elif any(word in response_lower for word in ["navigate", "go to", "visit", "open"]): + return ActionType.NAVIGATE + else: + # Default to retrieve for most cases + return ActionType.RETRIEVE + + @staticmethod + def _extract_results(response: str) -> list[Any]: + """ + Extract structured results from the response. + + Args: + response: Agent response string + + Returns: + List of extracted results + """ + # This is a simplified extraction - in practice, you'd want more sophisticated parsing + # For now, just return the response as a single result + return [response.strip()] if response.strip() else [] + + +class BrowserGymEvaluationAdapter: + """ + Main adapter that orchestrates the evaluation process for BrowserGym tasks. + """ + + def __init__(self): + self.response_adapter = BrowserGymResponseAdapter() + + async def evaluate_task( + self, + page, + config: Dict[str, Any], + agent_response: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Evaluate a BrowserGym task using the WebArena Verified evaluation system. 
+ Args: + page: Playwright page object + config: Task configuration dictionary + agent_response: Optional agent response string + Returns: + Evaluation result dictionary + """ + # Convert BrowserGym inputs to WebArena Verified format + task = WebArenaTask.model_validate(config) + resource = AllocationResource( + website_type=task.eval.site, + readonly=False, + ) # TODO: create a dummy resource for now + + # Try to parse the agent_response as a WebArenaVerifiedAgentResponse + try: + last_urls = await self._get_last_urls([resource]) + task_result = WebArenaTaskResponse( + response=WebArenaVerifiedAgentResponse.model_validate(agent_response), + last_urls=last_urls, + status=WebArenaTaskStatus.SUCCESS, + ) + except Exception as e: + logger.error(f"Failed to validate task result: {e}, agent_response: {agent_response}") + # task_result = WebArenaTaskResponse( + # response=None, + # last_urls=last_urls, + # status=WebArenaTaskStatus.AGENT_FAILURE, + # error_details=[str(e), f"Result String: {agent_response}"], + # ) + raise + + # task_response = self.response_adapter.create_response_from_result( + # page, agent_response + # ) + + evaluator = WebArenaEvaluator() + eval_results = await evaluator.evaluate_task( + task=task, + task_result=task_result, + resources=[resource], + ) + + # Convert results back to a simple score and message + if eval_results: + # Success only if all evals passed + overall_score = 1.0 if all(r.score == 1.0 for r in eval_results) else 0.0 + # Concatenate messages + messages = [ + msg for result in eval_results for msg in result.assertion_msgs + ] + message = "\n".join(messages) + return { + "score": overall_score, + "message": message, + } + else: + return { + "score": 0.0, + "message": "No evaluation results returned", + } + + + + # Copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/test_benchmark_task.py + @staticmethod + async def _get_last_urls(resources: list[AllocationResource]) -> list[str]: + async with 
async_playwright() as playwright: + for resource in resources: + browser = await playwright.chromium.connect_over_cdp(resource.cdp_url) + if browser.contexts: + context = browser.contexts[0] + + return [page.url for page in context.pages] + + # copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/conftest.py + @staticmethod + def _get_resource_requirements( + task: WebArenaTask, + ) -> list[WebsiteRequirement]: + """Benchmark hook: provide requirements for all sites in the current task. + + Returns a list of WebsiteRequirement (shared model) with website_type and readonly. + """ + + requirements: list[WebsiteRequirement] = [] + for site in task.sites: + website_type = site.value + print(f"Adding requirement for {website_type}") + # Default to write allocations for safety; customize if needed per task + readonly_flag = False + requirements.append( + WebsiteRequirement(website_type=website_type, readonly=readonly_flag) + ) + return requirements \ No newline at end of file diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py new file mode 100644 index 00000000..c7ec43e8 --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -0,0 +1,78 @@ +""" +WebArena Verified evaluators that integrate the full evaluation system +from platform-labs-agent-eval-harness. +""" + +import asyncio +import json +import logging + +import playwright + +from .browsergym_adapter import BrowserGymEvaluationAdapter + +logger = logging.getLogger(__name__) + + +class WebArenaVerifiedEvaluator: + """ + Evaluator that integrates the webarena_verified evaluation system. 
+ + This evaluator handles the new evaluation format with: + - expected_retrieve_value: Validates data retrieval + - expected_backend_state: Validates backend/database changes + - expected_ui_state: Validates UI state changes + """ + + def __init__(self): + """ + Initialize the evaluator. + """ + self.browsergym_adapter = BrowserGymEvaluationAdapter() + + def __call__( + self, + trajectory: list[dict], + config_file: str, + page: playwright.sync_api.Page = None, + client: playwright.async_api.CDPSession | None = None, + ) -> float: + """ + Entry point compatible with GenericWebArenaTask.validate(...). + + Args: + trajectory: Fake trajectory from BrowserGym: [{}, last_action]. last_action["answer"] may contain answer. + config_file: Config file path. + page: Playwright page. + client: Always None, none of webarena's evaluators requires a cdp session. + Returns: + Float score compatible with BrowserGym (1.0 or 0.0) + """ + with open(config_file, "r") as f: + config = json.load(f) + + agent_response = trajectory[-1].get("answer") + + # Run advanced evaluation via adapter and return float score + result = asyncio.run(self._async_evaluation(page, config, agent_response=agent_response)) + return float(result) + + async def _async_evaluation(self, page, config, agent_response=None) -> float: + """ + Use the full webarena_verified evaluation system via the BrowserGym adapter. 
+ """ + task_id = config.get("task_id") + logger.info(f"Running webarena_verified async evaluation for task {task_id}") + + try: + # Use the BrowserGym adapter to evaluate the task + result = await self.browsergym_adapter.evaluate_task( + page=page, + config=config, + agent_response=agent_response, + ) + return result.get("score", 0.0) + + except Exception as e: + logger.error(f"Error in webarena_verified async evaluation: {e}") + return 0.0 diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py new file mode 100644 index 00000000..a46f0cc5 --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -0,0 +1,133 @@ +import importlib.resources +import json +import logging +import tempfile +from typing import Optional + +import playwright.sync_api + +from browsergym.webarena.task import GenericWebArenaTask + +logger = logging.getLogger(__name__) + + +class WebArenaVerifiedTask(GenericWebArenaTask): + """ + WebArena Verified task class that integrates the full evaluation system + from platform-labs-agent-eval-harness. 
+ + This task class handles the new evaluation format with: + - expected_retrieve_value + - expected_backend_state + - expected_ui_state + """ + + def __init__( + self, + seed: int, + task_id: Optional[int] = None, + intent_template_id: Optional[int] = None, + with_na_hint: bool = False, + with_homepage_hint: bool = False, + ): + super().__init__( + seed=seed, + task_id=task_id, + intent_template_id=intent_template_id, + with_na_hint=with_na_hint, + with_homepage_hint=with_homepage_hint, + ) + + # Load the webarena_verified.json file + all_configs_str = ( + importlib.resources.files("browsergym.webarena_verified") + .joinpath("webarena_verified.json") + .read_text() + ) + + # substitute URLs + for pattern, url_key in { + "__GITLAB__": "gitlab", + "__REDDIT__": "reddit", + "__SHOPPING__": "shopping", + "__SHOPPING_ADMIN__": "shopping_admin", + "__WIKIPEDIA__": "wikipedia", + "__MAP__": "map", + }.items(): + all_configs_str = all_configs_str.replace(pattern, self.webarena_instance.urls[url_key]) + + # load all task configs to JSON + all_configs = json.loads(all_configs_str) + + # keep only the desired task configs + if intent_template_id is not None: + task_configs = [ + conf for conf in all_configs if conf["intent_template_id"] == intent_template_id + ] + if not task_configs: + raise ValueError( + f"Could not find any task config with intent_template_id={intent_template_id}." 
+ ) + + elif task_id is not None: + # Filter configs by task_id + task_configs = [conf for conf in all_configs if conf["task_id"] == task_id] + if not task_configs: + raise ValueError(f"Could not find any task config with task_id={task_id}.") + + self.task_configs = task_configs + + def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: + # Using the webarena_verified evaluator system + from .evaluators import WebArenaVerifiedEvaluator + + # pick a task at random + self.config = self.random.choice(self.task_configs) + + # hack: dynamically build a config file to read from + with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: + json.dump(self.config, f) + f.flush() + self.config_file = f.name + + # build the evaluator using the new webarena_verified evaluation system + self.evaluator = WebArenaVerifiedEvaluator(self.config) + + # authenticate + for site in self.config["sites"]: + self.webarena_instance.ui_login(site=site, page=page) + + # set geolocation if specified + if self.config.get("geolocation"): + page.context.set_geolocation(self.config["geolocation"]) + + # navigate to the starting url(s) (might need several pages) + # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/browser_env/envs.py#L150 + if self.config["start_url"]: + start_urls = self.config["start_url"].split(" |AND| ") + for i, url in enumerate(start_urls): + page.goto(url) + if i < len(start_urls) - 1: + page = page.context.new_page() + + # recover goal + goal = self.config["intent"] + + # This note is present in all webarena's agent prompts + # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 + # However, webarena_verified does not have a homepage, so skip this hint + self.with_homepage_hint = False + if self.with_homepage_hint: + goal += f""" + +(Note: if you want to visit other websites, check out the homepage at {self.webarena_instance.home_url}. 
It has a list of websites you can visit. {self.webarena_instance.home_url}/password.html lists all the account name and password for the websites. You can use them to log in to the websites.) +""" + + # This note is present in some of webarena's agent prompts + if self.with_na_hint: + goal += """\ + +If you believe the task is impossible to complete, provide the answer "N/A". +""" + + return goal, {} From 5be792d08b0bd81bbe12e27e4957f1cb9057e4c0 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 23 Sep 2025 19:26:11 +0000 Subject: [PATCH 06/64] upd readme --- browsergym/webarena_verified/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 658dd0ee..43ae475d 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -15,7 +15,7 @@ git clone https://github.com/ServiceNow/platform-labs-agent-eval-harness.git /ho ### 1. Install this BrowserGym package ```bash -pip install browsergym-webarena-verified +pip install -e ./browsergym/webarena_verified ``` This will automatically install the required dependencies from local file paths: From aae906c3f9e9ae8e4ffad2ce06b09c52eeacce21 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 23 Sep 2025 19:49:25 +0000 Subject: [PATCH 07/64] use custom backend for webarena_verified --- .../experiments/benchmark/configs.py | 3 +- .../browsergym/experiments/benchmark/utils.py | 30 +++ .../browsergym/webarena_verified/instance.py | 222 ++++++++++++++++++ 3 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index ee239021..9551111d 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ 
b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -1,4 +1,5 @@ import numpy as np + from browsergym.experiments.benchmark.metadata.utils import ( task_list_from_metadata, task_metadata, @@ -137,7 +138,7 @@ high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, supports_parallel_seeds=False, - backends=["webarena"], + backends=["webarena_verified"], env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), max_steps=30, diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index c554b15d..b7f256c8 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -6,6 +6,7 @@ from typing import Literal import numpy as np + from browsergym.experiments.loop import SEED_MAX, EnvArgs logger = logging.getLogger(__name__) @@ -141,6 +142,35 @@ def prepare_backend(backend: str): ] ) + case "webarena_verified": + # register environments + import browsergym.webarena_verified + + # full reset the instance (requires environment variables properly set up) + from browsergym.webarena_verified.instance import WebArenaVerifiedInstance + + default_instance = WebArenaVerifiedInstance() + default_instance.full_reset() + + # logging.info( + # f"Initiating WebArena Verified instance warm-up. Some tasks will be pre-loaded (massaged) to trigger some caching mechanisms and make the server more responsive." 
+ # ) + # massage_tasks( + # [ + # f"webarena_verified.{id}" + # for id in [ + # 410, # reddit + # 533, # gitlab + # 561, # gitlab wiki + # 562, # gitlab reddit + # 574, # shopping + # 640, # reddit + # 680, # shopping_admin + # 740, # wiki map + # ] + # ] + # ) + case "visualwebarena": # register environments import browsergym.visualwebarena diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py new file mode 100644 index 00000000..1b72fe39 --- /dev/null +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py @@ -0,0 +1,222 @@ +import logging +import os +import time + +import playwright.sync_api +import requests + +logger = logging.getLogger(__name__) + +ENV_VARS = ("SHOPPING", "SHOPPING_ADMIN", "REDDIT", "GITLAB", "WIKIPEDIA", "MAP", "HOMEPAGE") + + +class WebArenaVerifiedInstance: + """ + Utility class to access a WebArena instance. + + """ + + RESET_URL_VAR = "WA_FULL_RESET" # used by full_reset() + + def __init__( + self, + ) -> None: + + # setup webarena environment variables (webarena will read those on import) + append_wa = lambda x: f"WA_{x}" + for key in ENV_VARS: + assert append_wa(key) in os.environ, ( + f"Environment variable {append_wa(key)} missing.\n" + + "Please set the following environment variables to use WebArena through BrowserGym:\n" + + "\n".join([append_wa(x) for x in ENV_VARS]) + ) + os.environ[key] = os.environ[append_wa(key)] + + # import webarena on instanciation + from webarena.browser_env.env_config import ( + ACCOUNTS, + GITLAB, + HOMEPAGE, + MAP, + REDDIT, + SHOPPING, + SHOPPING_ADMIN, + WIKIPEDIA, + ) + + self.urls = { + "reddit": REDDIT, + "gitlab": GITLAB, + "shopping": SHOPPING, + "shopping_admin": SHOPPING_ADMIN, + "wikipedia": WIKIPEDIA, + "map": MAP, + } + self.home_url = HOMEPAGE + + self.credentials = ACCOUNTS + + def full_reset(self, skip_if_not_set: bool = True, site: str = "all"): + base_url 
= os.environ.get(self.RESET_URL_VAR, None) + + if not base_url: + # check for reset URL + logger.error( + f"Environment variable {self.RESET_URL_VAR} is missing or empty, required for a full instance reset." + ) + if skip_if_not_set: + logger.warning( + f"Skipping automated reset. Make sure the instance has been manually reset." + ) + else: + raise RuntimeError(f"Could not reset instance, aborting.") + + else: + # reset the instance + reset_url = f"{base_url}/reset" + status_url = f"{base_url}/status" + + logger.info( + f"Initiating {self.__class__.__name__} instance reset on URL {reset_url}. Should take between 200 - 500 seconds to restart." + ) + + # trigger instance reset + response = requests.get(reset_url) + match response.status_code: + case 200: + logger.info(f"Reset started.") + case 418: + logger.warning("Reset was already running.") + case _: + raise Exception( + f"{self.__class__.__name__} reset request {reset_url} failed ({response.status_code}): {response.text}" + ) + + # wait until reset complete + retry_after = 20 # 20 seconds wait between status checks + timeout = 10 * 60 # 10 minutes timeout + start_time = time.time() + while True: + # request instance status + response = requests.get(status_url) + # check for server error + if response.status_code != 200: + raise Exception( + f"{self.__class__.__name__} status request {status_url} failed ({response.status_code}): {response.text}" + ) + # check for readiness + if response.text == "Ready for duty!": + break + # check for timeout + time_elapsed = time.time() - start_time + logger.info(f"Reset still running after {time_elapsed:.0f} seconds...") + if time_elapsed > timeout: + raise Exception( + f"Reset still running after {time_elapsed} seconds (> {timeout}), aborting." 
+ ) + # wait a bit before next retry + time.sleep(retry_after) + + # warm-start the instance (navigate to every domain) + retries_left = 3 + while retries_left: + retries_left -= 1 + try: + self._check_is_reachable( + timeout=60, + site=site, + ) # 60 seconds, warming up after reset might be slow + break + except Exception as e: + if not retries_left: + raise + logger.info( + f"Instance unresponsive after reset, retrying ({retries_left} retries left)\n{e}" + ) + + def check_status(self, site: str = "all"): + """ + Check the status of the instance. Raises an error if the instance is not ready to be used. + + """ + self._check_is_reachable(timeout=10, site=site) # 10 seconds + + def _check_is_reachable(self, timeout: int, site: str = "all"): + """ + Test that every website is reachable. + + """ + if site == "all": + sites = self.urls.keys() + else: + assert site in self.urls.keys(), f"Site {site} not found in {self.urls.keys()}" + sites = [site] + + for site in sites: + url = self.urls[site] + try: + requests.get(url, timeout=timeout) + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + raise RuntimeError( + f'WebArena site "{site}" ({url}) is not reacheable. Please check the URL.' + ) + + def ui_login(self, site: str, page: playwright.sync_api.Page): + """ + Should only be called once per site (expects user to be logged out). 
+ """ + + url = self.urls[site] + + # open a new page (tab) to perform the login + page = page.context.new_page() + + match site: + case "reddit": + username = self.credentials[site]["username"] + password = self.credentials[site]["password"] + + page.goto(f"{url}") + page.get_by_role("link", name="Log in").click() + page.get_by_label("Username").fill(username) + page.get_by_label("Password").fill(password) + page.get_by_role("button", name="Log in").click() + + case "gitlab": + username = self.credentials[site]["username"] + password = self.credentials[site]["password"] + + page.goto(f"{url}/users/sign_in") + page.get_by_label("Username or email").fill(username) + page.get_by_label("Password").fill(password) + page.get_by_role("button", name="Sign in").click() + + case "shopping": + username = self.credentials[site]["username"] + password = self.credentials[site]["password"] + + page.goto(f"{url}/customer/account/login/") + page.get_by_label("Email", exact=True).fill(username) + page.get_by_label("Password", exact=True).fill(password) + page.get_by_role("button", name="Sign In").click() + + case "shopping_admin": + username = self.credentials[site]["username"] + password = self.credentials[site]["password"] + + page.goto(url) + page.get_by_label("Username").fill(username) + page.get_by_label("Password").fill(password) + page.get_by_role("button", name="Sign in").click() + + case "wikipedia": + page.goto(url) + + case "map": + page.goto(url) + + case _: + raise ValueError + + # release login page + page.close() From 2b04c7d9f12d9cd8fee752838228f4150bfc882d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 23 Sep 2025 19:51:53 +0000 Subject: [PATCH 08/64] pass the wa instance to the evaluator --- .../src/browsergym/webarena_verified/evaluators.py | 6 ++++-- .../src/browsergym/webarena_verified/instance.py | 3 ++- .../src/browsergym/webarena_verified/task.py | 10 +++++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git 
a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index c7ec43e8..d031eccd 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -9,6 +9,8 @@ import playwright +from browsergym.webarena_verified.instance import WebArenaVerifiedInstance + from .browsergym_adapter import BrowserGymEvaluationAdapter logger = logging.getLogger(__name__) @@ -24,11 +26,11 @@ class WebArenaVerifiedEvaluator: - expected_ui_state: Validates UI state changes """ - def __init__(self): + def __init__(self, webarena_instance: WebArenaVerifiedInstance): """ Initialize the evaluator. """ - self.browsergym_adapter = BrowserGymEvaluationAdapter() + self.browsergym_adapter = BrowserGymEvaluationAdapter(webarena_instance) def __call__( self, diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py index 1b72fe39..8252d729 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py @@ -12,8 +12,9 @@ class WebArenaVerifiedInstance: """ - Utility class to access a WebArena instance. + Utility class to access a WebArena Verified instances. 
+ TODO: connect to the wa_verified client dispatcher to access the instances and reset them """ RESET_URL_VAR = "WA_FULL_RESET" # used by full_reset() diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index a46f0cc5..c7547d53 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -7,6 +7,8 @@ import playwright.sync_api from browsergym.webarena.task import GenericWebArenaTask +from browsergym.webarena_verified.evaluators import WebArenaVerifiedEvaluator +from browsergym.webarena_verified.instance import WebArenaVerifiedInstance logger = logging.getLogger(__name__) @@ -38,6 +40,9 @@ def __init__( with_homepage_hint=with_homepage_hint, ) + # override the webarena instance to use the webarena_verified instance + self.webarena_instance = WebArenaVerifiedInstance() + # Load the webarena_verified.json file all_configs_str = ( importlib.resources.files("browsergym.webarena_verified") @@ -74,6 +79,9 @@ def __init__( task_configs = [conf for conf in all_configs if conf["task_id"] == task_id] if not task_configs: raise ValueError(f"Could not find any task config with task_id={task_id}.") + else: + # keep all task configs + task_configs = all_configs self.task_configs = task_configs @@ -91,7 +99,7 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: self.config_file = f.name # build the evaluator using the new webarena_verified evaluation system - self.evaluator = WebArenaVerifiedEvaluator(self.config) + self.evaluator = WebArenaVerifiedEvaluator(self.webarena_instance) # authenticate for site in self.config["sites"]: From 0d7e8dc21d0340b255b6c46263512f7c7160b88f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 23 Sep 2025 19:52:17 +0000 Subject: [PATCH 09/64] pass the wa instance to the evaluator --- 
.../src/browsergym/webarena_verified/browsergym_adapter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py index da1829a2..e37ccc79 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py @@ -11,6 +11,7 @@ from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement from playwright.async_api import async_playwright +from browsergym.webarena_verified.instance import WebArenaVerifiedInstance from webarena_verified.evaluation.evaluator import WebArenaEvaluator from webarena_verified.types import ( ActionType, @@ -137,7 +138,8 @@ class BrowserGymEvaluationAdapter: Main adapter that orchestrates the evaluation process for BrowserGym tasks. """ - def __init__(self): + def __init__(self, webarena_instance: WebArenaVerifiedInstance): + self.webarena_instance = webarena_instance # TODO use this to access the webarena_verified dispatcher and access the instances self.response_adapter = BrowserGymResponseAdapter() async def evaluate_task( From b57c0f880c06075d7e6110134a8841a4b8323cfb Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 26 Sep 2025 02:39:13 +0000 Subject: [PATCH 10/64] cleanup evaluator --- .../webarena_verified/browsergym_adapter.py | 247 ------------------ .../webarena_verified/evaluators.py | 144 ++++++++-- .../browsergym/webarena_verified/instance.py | 13 +- 3 files changed, 136 insertions(+), 268 deletions(-) delete mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py deleted file mode 
100644 index e37ccc79..00000000 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/browsergym_adapter.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -BrowserGym adapter for WebArena Verified evaluation system. - -This module provides adapter classes that bridge the gap between BrowserGym's -page-based approach and the platform-labs evaluation system's resource-based approach. -""" - -import logging -from typing import Any, Dict, Optional - -from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement -from playwright.async_api import async_playwright - -from browsergym.webarena_verified.instance import WebArenaVerifiedInstance -from webarena_verified.evaluation.evaluator import WebArenaEvaluator -from webarena_verified.types import ( - ActionType, - StatusType, - WebArenaTask, - WebArenaTaskResponse, - WebArenaTaskStatus, - WebArenaVerifiedAgentResponse, -) - -logger = logging.getLogger(__name__) - - - -class BrowserGymResponseAdapter: - """ - Adapter that converts BrowserGym task results to WebArenaTaskResponse format. - """ - - @staticmethod - def create_response_from_result( - page, - agent_response: Optional[str] = None, - status: WebArenaTaskStatus = WebArenaTaskStatus.SUCCESS, - error_details: Optional[list[str]] = None, - ) -> WebArenaTaskResponse: - """ - Create a WebArenaTaskResponse from BrowserGym task execution results. 
- - Args: - page: Playwright page object - agent_response: Optional agent response string - status: Task execution status - error_details: Optional error details - Returns: - WebArenaTaskResponse object - """ - try: - # Get current and recent URLs - current_url = page.url - last_urls = [current_url] # In a full implementation, this would track navigation history - - # Create agent response if provided - verified_response = None - if agent_response is not None: - # Try to parse the response and determine action type - action_type = BrowserGymResponseAdapter._determine_action_type(agent_response) - response_status = ( - StatusType.SUCCESS - if status == WebArenaTaskStatus.SUCCESS - else StatusType.UNKNOWN_ERROR - ) - - # Extract results if it's a retrieve action - results = None - if action_type == ActionType.RETRIEVE and response_status == StatusType.SUCCESS: - results = BrowserGymResponseAdapter._extract_results(agent_response) - - verified_response = WebArenaVerifiedAgentResponse( - action=action_type, - status=response_status, - results=results, - error_details=error_details[0] if error_details else None - ) - - return WebArenaTaskResponse( - response=verified_response, - last_urls=last_urls, - status=status, - error_details=error_details, - ) - - except Exception as e: - logger.error(f"Error creating WebArenaTaskResponse: {e}") - return WebArenaTaskResponse( - response=None, - last_urls=[page.url if page else ""], - status=WebArenaTaskStatus.AGENT_FAILURE, - error_details=[str(e)], - ) - - @staticmethod - def _determine_action_type(response: str) -> ActionType: - """ - Determine the action type from the response content. 
- - Args: - response: Agent response string - - Returns: - ActionType enum value - """ - response_lower = response.lower() - - # Simple heuristics to determine action type - if any(word in response_lower for word in ["get", "find", "search", "retrieve", "show", "list"]): - return ActionType.RETRIEVE - elif any(word in response_lower for word in ["create", "add", "update", "delete", "modify", "change"]): - return ActionType.MUTATE - elif any(word in response_lower for word in ["navigate", "go to", "visit", "open"]): - return ActionType.NAVIGATE - else: - # Default to retrieve for most cases - return ActionType.RETRIEVE - - @staticmethod - def _extract_results(response: str) -> list[Any]: - """ - Extract structured results from the response. - - Args: - response: Agent response string - - Returns: - List of extracted results - """ - # This is a simplified extraction - in practice, you'd want more sophisticated parsing - # For now, just return the response as a single result - return [response.strip()] if response.strip() else [] - - -class BrowserGymEvaluationAdapter: - """ - Main adapter that orchestrates the evaluation process for BrowserGym tasks. - """ - - def __init__(self, webarena_instance: WebArenaVerifiedInstance): - self.webarena_instance = webarena_instance # TODO use this to access the webarena_verified dispatcher and access the instances - self.response_adapter = BrowserGymResponseAdapter() - - async def evaluate_task( - self, - page, - config: Dict[str, Any], - agent_response: Optional[str] = None, - ) -> Dict[str, Any]: - """ - Evaluate a BrowserGym task using the WebArena Verified evaluation system. 
- Args: - page: Playwright page object - config: Task configuration dictionary - agent_response: Optional agent response string - Returns: - Evaluation result dictionary - """ - # Convert BrowserGym inputs to WebArena Verified format - task = WebArenaTask.model_validate(config) - resource = AllocationResource( - website_type=task.eval.site, - readonly=False, - ) # TODO: create a dummy resource for now - - # Try to parse the agent_response as a WebArenaVerifiedAgentResponse - try: - last_urls = await self._get_last_urls([resource]) - task_result = WebArenaTaskResponse( - response=WebArenaVerifiedAgentResponse.model_validate(agent_response), - last_urls=last_urls, - status=WebArenaTaskStatus.SUCCESS, - ) - except Exception as e: - logger.error(f"Failed to validate task result: {e}, agent_response: {agent_response}") - # task_result = WebArenaTaskResponse( - # response=None, - # last_urls=last_urls, - # status=WebArenaTaskStatus.AGENT_FAILURE, - # error_details=[str(e), f"Result String: {agent_response}"], - # ) - raise - - # task_response = self.response_adapter.create_response_from_result( - # page, agent_response - # ) - - evaluator = WebArenaEvaluator() - eval_results = await evaluator.evaluate_task( - task=task, - task_result=task_result, - resources=[resource], - ) - - # Convert results back to a simple score and message - if eval_results: - # Success only if all evals passed - overall_score = 1.0 if all(r.score == 1.0 for r in eval_results) else 0.0 - # Concatenate messages - messages = [ - msg for result in eval_results for msg in result.assertion_msgs - ] - message = "\n".join(messages) - return { - "score": overall_score, - "message": message, - } - else: - return { - "score": 0.0, - "message": "No evaluation results returned", - } - - - - # Copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/test_benchmark_task.py - @staticmethod - async def _get_last_urls(resources: list[AllocationResource]) -> list[str]: - async with 
async_playwright() as playwright: - for resource in resources: - browser = await playwright.chromium.connect_over_cdp(resource.cdp_url) - if browser.contexts: - context = browser.contexts[0] - - return [page.url for page in context.pages] - - # copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/conftest.py - @staticmethod - def _get_resource_requirements( - task: WebArenaTask, - ) -> list[WebsiteRequirement]: - """Benchmark hook: provide requirements for all sites in the current task. - - Returns a list of WebsiteRequirement (shared model) with website_type and readonly. - """ - - requirements: list[WebsiteRequirement] = [] - for site in task.sites: - website_type = site.value - print(f"Adding requirement for {website_type}") - # Default to write allocations for safety; customize if needed per task - readonly_flag = False - requirements.append( - WebsiteRequirement(website_type=website_type, readonly=readonly_flag) - ) - return requirements \ No newline at end of file diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index d031eccd..3dc17688 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -6,16 +6,33 @@ import asyncio import json import logging +from datetime import datetime +from typing import Any, Dict, Optional import playwright +from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement +from playwright.async_api import async_playwright from browsergym.webarena_verified.instance import WebArenaVerifiedInstance - -from .browsergym_adapter import BrowserGymEvaluationAdapter +from webarena_verified.evaluation.evaluator import WebArenaEvaluator +from webarena_verified.types import ( + WebArenaTask, + WebArenaTaskResponse, + WebArenaTaskStatus, + 
WebArenaVerifiedAgentResponse, +) logger = logging.getLogger(__name__) +CONTAINER_NAMES = { + "shopping": "shopping-srv-client-0", + "reddit": "reddit-srv-0", + "shopping_admin": "shopping-srv-admin-0", + "gitlab": "gitlab", + "map": "NA", +} + class WebArenaVerifiedEvaluator: """ Evaluator that integrates the webarena_verified evaluation system. @@ -26,11 +43,11 @@ class WebArenaVerifiedEvaluator: - expected_ui_state: Validates UI state changes """ - def __init__(self, webarena_instance: WebArenaVerifiedInstance): + def __init__(self, webarena_verified_instance: WebArenaVerifiedInstance): """ Initialize the evaluator. """ - self.browsergym_adapter = BrowserGymEvaluationAdapter(webarena_instance) + self.wav_instance = webarena_verified_instance def __call__( self, @@ -55,26 +72,113 @@ def __call__( agent_response = trajectory[-1].get("answer") - # Run advanced evaluation via adapter and return float score - result = asyncio.run(self._async_evaluation(page, config, agent_response=agent_response)) - return float(result) - - async def _async_evaluation(self, page, config, agent_response=None) -> float: - """ - Use the full webarena_verified evaluation system via the BrowserGym adapter. - """ + # Run wa_verified evaluation and return float score task_id = config.get("task_id") logger.info(f"Running webarena_verified async evaluation for task {task_id}") - - try: - # Use the BrowserGym adapter to evaluate the task - result = await self.browsergym_adapter.evaluate_task( + result = asyncio.run(self.evaluate_task( page=page, config=config, agent_response=agent_response, + )) + logger.info(f"Webarena_verified evaluation result for task {task_id}: {result}") + return result.get("score", 0.0) + + async def evaluate_task( + self, + page, + config: Dict[str, Any], + agent_response: Optional[str] = None, + ) -> Dict[str, Any]: + """ + Evaluate a BrowserGym task using the WebArena Verified evaluation system. 
+ Args: + page: Playwright page object + config: Task configuration dictionary + agent_response: Optional agent response string + Returns: + Evaluation result dictionary + """ + # Convert BrowserGym inputs to WebArena Verified format + task = WebArenaTask.model_validate(config) + resource = self.create_mock_allocation_resource(task.eval.site) + + # Try to parse the agent_response as a WebArenaVerifiedAgentResponse + try: + # last_urls = await self._get_last_urls([resource]) + last_urls = ["N/A"] # TODO: check if ok + task_result = WebArenaTaskResponse( + response=WebArenaVerifiedAgentResponse.model_validate(agent_response), + last_urls=last_urls, + status=WebArenaTaskStatus.SUCCESS, ) - return result.get("score", 0.0) - except Exception as e: - logger.error(f"Error in webarena_verified async evaluation: {e}") - return 0.0 + logger.error(f"Failed to parse agent response as WebArenaVerifiedAgentResponse: {e}, agent_response: {agent_response}") + # task_result = WebArenaTaskResponse( + # response=None, + # last_urls=last_urls, + # status=WebArenaTaskStatus.AGENT_FAILURE, + # error_details=[str(e), f"Result String: {agent_response}"], + # ) + raise + + evaluator = WebArenaEvaluator() + eval_results = await evaluator.evaluate_task( + task=task, + task_result=task_result, + resources=[resource], + ) + + # Convert results back to a simple score and message + if eval_results: + # Success only if all evals passed + overall_score = 1.0 if all(r.score == 1.0 for r in eval_results) else 0.0 + # Concatenate messages + messages = [ + msg for result in eval_results for msg in result.assertion_msgs + ] + message = "\n".join(messages) + return { + "score": overall_score, + "message": message, + } + else: + return { + "score": 0.0, + "message": "No evaluation results returned", + } + + + # copied over from /platform-labs-agent-eval-harness/benchmarks/webarena-verified/scripts/test_evals.py + def create_mock_allocation_resource(self, site: str) -> AllocationResource: + """ + Create a 
mock AllocationResource for validation purposes. + """ + username = self.wav_instance.credentials[site]["username"] + password = self.wav_instance.credentials[site]["password"] + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + + return AllocationResource( + allocation_id=f"allocation-{site}-{timestamp}", + site_id=f"{site}-{timestamp}", + container_name=CONTAINER_NAMES.get(site, "MISSING_CONTAINER_NAME"), + website_type=site, + base_url=self.wav_instance.urls.get(site, "MISSING"), + cdp_url=self.wav_instance.urls.get(site, "MISSING"), # TODO: check if ok + vnc_url=self.wav_instance.urls.get(site, "MISSING"), # TODO: check if ok + readonly=False, + username=username, + password=password, + role="admin", + ) + + # Copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/test_benchmark_task.py + @staticmethod + async def _get_last_urls(resources: list[AllocationResource]) -> list[str]: + async with async_playwright() as playwright: + for resource in resources: + browser = await playwright.chromium.connect_over_cdp(resource.cdp_url) + if browser.contexts: + context = browser.contexts[0] + + return [page.url for page in context.pages] + diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py index 8252d729..54f66d0b 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py @@ -14,7 +14,18 @@ class WebArenaVerifiedInstance: """ Utility class to access a WebArena Verified instances. - TODO: connect to the wa_verified client dispatcher to access the instances and reset them + == TODO == + Right now this is a copy of the WebArenaInstance class. + - check if we actually need a custom class or if the default WebArenaInstance class is enough. 
+ MAYBE: + - connect to the wa_verified client dispatcher to access the instances and reset them + IF we need a custom class, the following fields must be present: + - urls: Dict[str, str] + - home_url: str + - full_reset: Callable function that resets the instance + - check_status: Callable function that checks the status of the instance + - ui_login: Callable function that logs in to the instance + == END TODO == """ RESET_URL_VAR = "WA_FULL_RESET" # used by full_reset() From 0330f726b74a7bd1add8e42bd5a815fa4caa6dfb Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 3 Oct 2025 18:37:17 +0000 Subject: [PATCH 11/64] remove custom webarena verified instance --- .../browsergym/webarena_verified/instance.py | 234 ------------------ 1 file changed, 234 deletions(-) delete mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py deleted file mode 100644 index 54f66d0b..00000000 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/instance.py +++ /dev/null @@ -1,234 +0,0 @@ -import logging -import os -import time - -import playwright.sync_api -import requests - -logger = logging.getLogger(__name__) - -ENV_VARS = ("SHOPPING", "SHOPPING_ADMIN", "REDDIT", "GITLAB", "WIKIPEDIA", "MAP", "HOMEPAGE") - - -class WebArenaVerifiedInstance: - """ - Utility class to access a WebArena Verified instances. - - == TODO == - Right now this is a copy of the WebArenaInstance class. - - check if we actually need a custom class or if the default WebArenaInstance class is enough. 
- MAYBE: - - connect to the wa_verified client dispatcher to access the instances and reset them - IF we need a custom class, the following fields must be present: - - urls: Dict[str, str] - - home_url: str - - full_reset: Callable function that resets the instance - - check_status: Callable function that checks the status of the instance - - ui_login: Callable function that logs in to the instance - == END TODO == - """ - - RESET_URL_VAR = "WA_FULL_RESET" # used by full_reset() - - def __init__( - self, - ) -> None: - - # setup webarena environment variables (webarena will read those on import) - append_wa = lambda x: f"WA_{x}" - for key in ENV_VARS: - assert append_wa(key) in os.environ, ( - f"Environment variable {append_wa(key)} missing.\n" - + "Please set the following environment variables to use WebArena through BrowserGym:\n" - + "\n".join([append_wa(x) for x in ENV_VARS]) - ) - os.environ[key] = os.environ[append_wa(key)] - - # import webarena on instanciation - from webarena.browser_env.env_config import ( - ACCOUNTS, - GITLAB, - HOMEPAGE, - MAP, - REDDIT, - SHOPPING, - SHOPPING_ADMIN, - WIKIPEDIA, - ) - - self.urls = { - "reddit": REDDIT, - "gitlab": GITLAB, - "shopping": SHOPPING, - "shopping_admin": SHOPPING_ADMIN, - "wikipedia": WIKIPEDIA, - "map": MAP, - } - self.home_url = HOMEPAGE - - self.credentials = ACCOUNTS - - def full_reset(self, skip_if_not_set: bool = True, site: str = "all"): - base_url = os.environ.get(self.RESET_URL_VAR, None) - - if not base_url: - # check for reset URL - logger.error( - f"Environment variable {self.RESET_URL_VAR} is missing or empty, required for a full instance reset." - ) - if skip_if_not_set: - logger.warning( - f"Skipping automated reset. Make sure the instance has been manually reset." 
- ) - else: - raise RuntimeError(f"Could not reset instance, aborting.") - - else: - # reset the instance - reset_url = f"{base_url}/reset" - status_url = f"{base_url}/status" - - logger.info( - f"Initiating {self.__class__.__name__} instance reset on URL {reset_url}. Should take between 200 - 500 seconds to restart." - ) - - # trigger instance reset - response = requests.get(reset_url) - match response.status_code: - case 200: - logger.info(f"Reset started.") - case 418: - logger.warning("Reset was already running.") - case _: - raise Exception( - f"{self.__class__.__name__} reset request {reset_url} failed ({response.status_code}): {response.text}" - ) - - # wait until reset complete - retry_after = 20 # 20 seconds wait between status checks - timeout = 10 * 60 # 10 minutes timeout - start_time = time.time() - while True: - # request instance status - response = requests.get(status_url) - # check for server error - if response.status_code != 200: - raise Exception( - f"{self.__class__.__name__} status request {status_url} failed ({response.status_code}): {response.text}" - ) - # check for readiness - if response.text == "Ready for duty!": - break - # check for timeout - time_elapsed = time.time() - start_time - logger.info(f"Reset still running after {time_elapsed:.0f} seconds...") - if time_elapsed > timeout: - raise Exception( - f"Reset still running after {time_elapsed} seconds (> {timeout}), aborting." - ) - # wait a bit before next retry - time.sleep(retry_after) - - # warm-start the instance (navigate to every domain) - retries_left = 3 - while retries_left: - retries_left -= 1 - try: - self._check_is_reachable( - timeout=60, - site=site, - ) # 60 seconds, warming up after reset might be slow - break - except Exception as e: - if not retries_left: - raise - logger.info( - f"Instance unresponsive after reset, retrying ({retries_left} retries left)\n{e}" - ) - - def check_status(self, site: str = "all"): - """ - Check the status of the instance. 
Raises an error if the instance is not ready to be used. - - """ - self._check_is_reachable(timeout=10, site=site) # 10 seconds - - def _check_is_reachable(self, timeout: int, site: str = "all"): - """ - Test that every website is reachable. - - """ - if site == "all": - sites = self.urls.keys() - else: - assert site in self.urls.keys(), f"Site {site} not found in {self.urls.keys()}" - sites = [site] - - for site in sites: - url = self.urls[site] - try: - requests.get(url, timeout=timeout) - except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): - raise RuntimeError( - f'WebArena site "{site}" ({url}) is not reacheable. Please check the URL.' - ) - - def ui_login(self, site: str, page: playwright.sync_api.Page): - """ - Should only be called once per site (expects user to be logged out). - """ - - url = self.urls[site] - - # open a new page (tab) to perform the login - page = page.context.new_page() - - match site: - case "reddit": - username = self.credentials[site]["username"] - password = self.credentials[site]["password"] - - page.goto(f"{url}") - page.get_by_role("link", name="Log in").click() - page.get_by_label("Username").fill(username) - page.get_by_label("Password").fill(password) - page.get_by_role("button", name="Log in").click() - - case "gitlab": - username = self.credentials[site]["username"] - password = self.credentials[site]["password"] - - page.goto(f"{url}/users/sign_in") - page.get_by_label("Username or email").fill(username) - page.get_by_label("Password").fill(password) - page.get_by_role("button", name="Sign in").click() - - case "shopping": - username = self.credentials[site]["username"] - password = self.credentials[site]["password"] - - page.goto(f"{url}/customer/account/login/") - page.get_by_label("Email", exact=True).fill(username) - page.get_by_label("Password", exact=True).fill(password) - page.get_by_role("button", name="Sign In").click() - - case "shopping_admin": - username = self.credentials[site]["username"] 
- password = self.credentials[site]["password"] - - page.goto(url) - page.get_by_label("Username").fill(username) - page.get_by_label("Password").fill(password) - page.get_by_role("button", name="Sign in").click() - - case "wikipedia": - page.goto(url) - - case "map": - page.goto(url) - - case _: - raise ValueError - - # release login page - page.close() From ab0437bda34ff98fcf8cd94bb1590e039ee7b527 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 3 Oct 2025 18:37:45 +0000 Subject: [PATCH 12/64] update requirements to latest wav code --- browsergym/webarena_verified/requirements.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 4db23ded..dc30c254 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1,5 +1,3 @@ browsergym-core==0.14.2 libwebarena==0.0.4 -webarena-verified @ file:///home/toolkit/platform-labs-agent-eval-harness/benchmarks/webarena-verified -agent-eval-harness-common @ file:///home/toolkit/platform-labs-agent-eval-harness/packages/agent-eval-harness-common -agent-eval-harness-pytest @ file:///home/toolkit/platform-labs-agent-eval-harness/packages/agent-eval-harness-pytest +webarena-verified @ file:///home/toolkit/platform-labs-webarena-verified From bd434675de12fba283955bfc3f6213a17be88f22 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 3 Oct 2025 18:38:37 +0000 Subject: [PATCH 13/64] use simpler and cleaner wav eval --- .../webarena_verified/evaluators.py | 150 ++++-------------- .../src/browsergym/webarena_verified/task.py | 3 - 2 files changed, 31 insertions(+), 122 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 3dc17688..92bc085b 100644 --- 
a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -4,23 +4,26 @@ """ import asyncio +import importlib import json import logging from datetime import datetime +from pathlib import Path from typing import Any, Dict, Optional import playwright from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement from playwright.async_api import async_playwright -from browsergym.webarena_verified.instance import WebArenaVerifiedInstance -from webarena_verified.evaluation.evaluator import WebArenaEvaluator -from webarena_verified.types import ( - WebArenaTask, - WebArenaTaskResponse, - WebArenaTaskStatus, - WebArenaVerifiedAgentResponse, +from browsergym.webarena.instance import WebArenaInstance +from webarena_verified.api.evaluator_api import TaskEvaluator +from webarena_verified.types.eval import ( + NetworkTrace, + WebarenaTaskEvalRequest, + WebarenaTaskEvalResult, ) +from webarena_verified.types.settings import URLMap, WebArenaVerifiedSettings +from webarena_verified.types.task import WebArenaVerifiedTask logger = logging.getLogger(__name__) @@ -42,12 +45,17 @@ class WebArenaVerifiedEvaluator: - expected_backend_state: Validates backend/database changes - expected_ui_state: Validates UI state changes """ - - def __init__(self, webarena_verified_instance: WebArenaVerifiedInstance): + + def __init__(self, webarena_instance: WebArenaInstance): """ Initialize the evaluator. 
""" - self.wav_instance = webarena_verified_instance + self.evaluator = TaskEvaluator( + WebArenaVerifiedSettings( + test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"), + url_map=URLMap(root=webarena_instance.urls), + ) + ) def __call__( self, @@ -70,115 +78,19 @@ def __call__( with open(config_file, "r") as f: config = json.load(f) - agent_response = trajectory[-1].get("answer") - - # Run wa_verified evaluation and return float score - task_id = config.get("task_id") - logger.info(f"Running webarena_verified async evaluation for task {task_id}") - result = asyncio.run(self.evaluate_task( - page=page, - config=config, - agent_response=agent_response, - )) - logger.info(f"Webarena_verified evaluation result for task {task_id}: {result}") - return result.get("score", 0.0) - - async def evaluate_task( - self, - page, - config: Dict[str, Any], - agent_response: Optional[str] = None, - ) -> Dict[str, Any]: - """ - Evaluate a BrowserGym task using the WebArena Verified evaluation system. 
- Args: - page: Playwright page object - config: Task configuration dictionary - agent_response: Optional agent response string - Returns: - Evaluation result dictionary - """ - # Convert BrowserGym inputs to WebArena Verified format - task = WebArenaTask.model_validate(config) - resource = self.create_mock_allocation_resource(task.eval.site) - - # Try to parse the agent_response as a WebArenaVerifiedAgentResponse - try: - # last_urls = await self._get_last_urls([resource]) - last_urls = ["N/A"] # TODO: check if ok - task_result = WebArenaTaskResponse( - response=WebArenaVerifiedAgentResponse.model_validate(agent_response), - last_urls=last_urls, - status=WebArenaTaskStatus.SUCCESS, - ) - except Exception as e: - logger.error(f"Failed to parse agent response as WebArenaVerifiedAgentResponse: {e}, agent_response: {agent_response}") - # task_result = WebArenaTaskResponse( - # response=None, - # last_urls=last_urls, - # status=WebArenaTaskStatus.AGENT_FAILURE, - # error_details=[str(e), f"Result String: {agent_response}"], - # ) - raise - - evaluator = WebArenaEvaluator() - eval_results = await evaluator.evaluate_task( - task=task, - task_result=task_result, - resources=[resource], + # create eval request + eval_request = WebarenaTaskEvalRequest( + task=WebArenaVerifiedTask.model_validate(config), + agent_response_raw=trajectory[-1].get("answer"), + network_trace=NetworkTrace.from_playwright_trace(...), # TODO: add path to playwright trace should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") ) - - # Convert results back to a simple score and message - if eval_results: - # Success only if all evals passed - overall_score = 1.0 if all(r.score == 1.0 for r in eval_results) else 0.0 - # Concatenate messages - messages = [ - msg for result in eval_results for msg in result.assertion_msgs - ] - message = "\n".join(messages) - return { - "score": overall_score, - "message": message, - } - else: - return { - "score": 0.0, - "message": "No evaluation 
results returned", - } - - # copied over from /platform-labs-agent-eval-harness/benchmarks/webarena-verified/scripts/test_evals.py - def create_mock_allocation_resource(self, site: str) -> AllocationResource: - """ - Create a mock AllocationResource for validation purposes. - """ - username = self.wav_instance.credentials[site]["username"] - password = self.wav_instance.credentials[site]["password"] - timestamp = datetime.now().strftime("%Y%m%d%H%M%S") - - return AllocationResource( - allocation_id=f"allocation-{site}-{timestamp}", - site_id=f"{site}-{timestamp}", - container_name=CONTAINER_NAMES.get(site, "MISSING_CONTAINER_NAME"), - website_type=site, - base_url=self.wav_instance.urls.get(site, "MISSING"), - cdp_url=self.wav_instance.urls.get(site, "MISSING"), # TODO: check if ok - vnc_url=self.wav_instance.urls.get(site, "MISSING"), # TODO: check if ok - readonly=False, - username=username, - password=password, - role="admin", - ) - - # Copied from platform-labs-agent-eval-harness/benchmarks/webarena-verified/tests/test_benchmark_task.py - @staticmethod - async def _get_last_urls(resources: list[AllocationResource]) -> list[str]: - async with async_playwright() as playwright: - for resource in resources: - browser = await playwright.chromium.connect_over_cdp(resource.cdp_url) - if browser.contexts: - context = browser.contexts[0] - - return [page.url for page in context.pages] + # Run wa_verified evaluation and return float score + logger.info(f"Running webarena_verified evaluation for task {eval_request.task.task_id}") + results: list[WebarenaTaskEvalResult] = self.evaluator.eval_task(eval_request) + logger.info(f"Webarena_verified evaluation result for task {eval_request.task.task_id}:") + for result in results: + logger.info(f"status: {result.status}, score: {result.score}, error_msg: {result.error_msg}") + # return average score + return sum(result.score for result in results) / len(results) diff --git 
a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index c7547d53..2a4c8f80 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -86,9 +86,6 @@ def __init__( self.task_configs = task_configs def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: - # Using the webarena_verified evaluator system - from .evaluators import WebArenaVerifiedEvaluator - # pick a task at random self.config = self.random.choice(self.task_configs) From fecedb196c2e7b6ebe9abf7017c56445cf1e2267 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 16 Oct 2025 19:31:33 +0000 Subject: [PATCH 14/64] enable tracing --- .../webarena_verified/evaluators.py | 26 ++++++++++++------- .../src/browsergym/webarena_verified/task.py | 3 +++ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 92bc085b..7e95abc5 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -3,17 +3,12 @@ from platform-labs-agent-eval-harness. 
""" -import asyncio -import importlib import json import logging -from datetime import datetime +import tempfile from pathlib import Path -from typing import Any, Dict, Optional import playwright -from agent_eval_harness_common.models import AllocationResource, WebsiteRequirement -from playwright.async_api import async_playwright from browsergym.webarena.instance import WebArenaInstance from webarena_verified.api.evaluator_api import TaskEvaluator @@ -75,14 +70,27 @@ def __call__( Returns: Float score compatible with BrowserGym (1.0 or 0.0) """ + # import webarena dynamically + from webarena.browser_env.actions import ActionTypes + # if last action is not a STOP action, return 0.0 as the task is not completed yet + if trajectory[-1].get("action_type") != ActionTypes.STOP: + return 0.0 + + # task is done: load the config file, stop playwright tracing, and evaluate the trace with open(config_file, "r") as f: - config = json.load(f) + config_raw = json.load(f) + config: WebArenaVerifiedTask = WebArenaVerifiedTask.model_validate(config_raw) + + # stop playwright tracing + with tempfile.TemporaryDirectory() as temp_dir: + trace_path = Path(temp_dir) / f"wav_{config.task_id}.zip" + page.context.tracing.stop(path=trace_path) # create eval request eval_request = WebarenaTaskEvalRequest( - task=WebArenaVerifiedTask.model_validate(config), + task=config, agent_response_raw=trajectory[-1].get("answer"), - network_trace=NetworkTrace.from_playwright_trace(...), # TODO: add path to playwright trace should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") + network_trace=NetworkTrace.from_playwright_trace(trace_path), # TODO: add path to playwright trace should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") ) # Run wa_verified evaluation and return float score diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 2a4c8f80..3662ce8c 
100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -102,6 +102,9 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: for site in self.config["sites"]: self.webarena_instance.ui_login(site=site, page=page) + # enable playwright tracing (required for webarena_verified evaluation) + page.context.tracing.start(snapshots=True) + # set geolocation if specified if self.config.get("geolocation"): page.context.set_geolocation(self.config["geolocation"]) From 8fdebe651542dfa6e97c726791db0439dcd53aed Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 20 Oct 2025 19:57:21 +0000 Subject: [PATCH 15/64] fix wav --- .../experiments/benchmark/configs.py | 2 +- .../webarena_verified/evaluators.py | 27 ++++++++----------- .../src/browsergym/webarena_verified/task.py | 4 --- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 9551111d..f5394a36 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -138,7 +138,7 @@ high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, supports_parallel_seeds=False, - backends=["webarena_verified"], + backends=["webarena"], env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), max_steps=30, diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 7e95abc5..fd30fea3 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ 
b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -11,14 +11,16 @@ import playwright from browsergym.webarena.instance import WebArenaInstance -from webarena_verified.api.evaluator_api import TaskEvaluator +from webarena_verified.api.evaluator_api import ( + WebArenaVerifiedEvaluator as WebArenaVerifiedEvaluatorAPI, +) from webarena_verified.types.eval import ( - NetworkTrace, - WebarenaTaskEvalRequest, - WebarenaTaskEvalResult, + TaskEvalRequest, + TaskEvalResult, ) from webarena_verified.types.settings import URLMap, WebArenaVerifiedSettings from webarena_verified.types.task import WebArenaVerifiedTask +from webarena_verified.types.tracing import NetworkTrace logger = logging.getLogger(__name__) @@ -45,7 +47,7 @@ def __init__(self, webarena_instance: WebArenaInstance): """ Initialize the evaluator. """ - self.evaluator = TaskEvaluator( + self.evaluator = WebArenaVerifiedEvaluatorAPI( WebArenaVerifiedSettings( test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"), url_map=URLMap(root=webarena_instance.urls), @@ -57,7 +59,7 @@ def __call__( trajectory: list[dict], config_file: str, page: playwright.sync_api.Page = None, - client: playwright.async_api.CDPSession | None = None, + client: playwright.sync_api.CDPSession | None = None, ) -> float: """ Entry point compatible with GenericWebArenaTask.validate(...). 
@@ -86,17 +88,10 @@ def __call__( trace_path = Path(temp_dir) / f"wav_{config.task_id}.zip" page.context.tracing.stop(path=trace_path) - # create eval request - eval_request = WebarenaTaskEvalRequest( - task=config, - agent_response_raw=trajectory[-1].get("answer"), - network_trace=NetworkTrace.from_playwright_trace(trace_path), # TODO: add path to playwright trace should be Path(exp_args.exp_dir / "pw_traces" / f"{exp_args.exp_name}.zip") - ) - # Run wa_verified evaluation and return float score - logger.info(f"Running webarena_verified evaluation for task {eval_request.task.task_id}") - results: list[WebarenaTaskEvalResult] = self.evaluator.eval_task(eval_request) - logger.info(f"Webarena_verified evaluation result for task {eval_request.task.task_id}:") + logger.info(f"Running webarena_verified evaluation for task {config.task_id}") + results: list[TaskEvalResult] = self.evaluator.evaluate_task(config.task_id, trajectory[-1].get("answer"), trace_path) + logger.info(f"Webarena_verified evaluation result for task {config.task_id}:") for result in results: logger.info(f"status: {result.status}, score: {result.score}, error_msg: {result.error_msg}") # return average score diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 3662ce8c..643355ce 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -8,7 +8,6 @@ from browsergym.webarena.task import GenericWebArenaTask from browsergym.webarena_verified.evaluators import WebArenaVerifiedEvaluator -from browsergym.webarena_verified.instance import WebArenaVerifiedInstance logger = logging.getLogger(__name__) @@ -40,9 +39,6 @@ def __init__( with_homepage_hint=with_homepage_hint, ) - # override the webarena instance to use the webarena_verified instance - self.webarena_instance = WebArenaVerifiedInstance() - # 
Load the webarena_verified.json file all_configs_str = ( importlib.resources.files("browsergym.webarena_verified") From 4bdfa7e4a8f8675c4e4a010688e96e27477914c9 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 22 Oct 2025 15:57:06 +0000 Subject: [PATCH 16/64] update to new webarena verified version --- .../webarena_verified/evaluators.py | 69 +- .../webarena_verified/webarena_verified.json | 61686 +++++----------- 2 files changed, 17683 insertions(+), 44072 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index fd30fea3..1bd38ae6 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -11,28 +11,25 @@ import playwright from browsergym.webarena.instance import WebArenaInstance -from webarena_verified.api.evaluator_api import ( +from webarena_verified.api import WebArenaVerifiedDataReader +from webarena_verified.api import ( WebArenaVerifiedEvaluator as WebArenaVerifiedEvaluatorAPI, ) +from webarena_verified.types import WebArenaVerifiedTask +from webarena_verified.types.config import ( + EnvironmentConfig, + WebArenaSite, + WebArenaVerifiedConfig, +) from webarena_verified.types.eval import ( - TaskEvalRequest, + TaskEvalContext, TaskEvalResult, ) -from webarena_verified.types.settings import URLMap, WebArenaVerifiedSettings -from webarena_verified.types.task import WebArenaVerifiedTask from webarena_verified.types.tracing import NetworkTrace logger = logging.getLogger(__name__) -CONTAINER_NAMES = { - "shopping": "shopping-srv-client-0", - "reddit": "reddit-srv-0", - "shopping_admin": "shopping-srv-admin-0", - "gitlab": "gitlab", - "map": "NA", -} - class WebArenaVerifiedEvaluator: """ Evaluator that integrates the webarena_verified evaluation system. 
@@ -47,12 +44,25 @@ def __init__(self, webarena_instance: WebArenaInstance): """ Initialize the evaluator. """ - self.evaluator = WebArenaVerifiedEvaluatorAPI( - WebArenaVerifiedSettings( - test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"), - url_map=URLMap(root=webarena_instance.urls), - ) + # Create configuration for all sites and homepage from webarena_instance + config = WebArenaVerifiedConfig( + test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"), + environments={ + **{ + site: EnvironmentConfig( + urls=[webarena_instance.urls[site]], + credentials=webarena_instance.credentials.get(site), + ) + for site in webarena_instance.urls + }, + WebArenaSite.HOMEPAGE: EnvironmentConfig( + urls=[webarena_instance.home_url], + ) + } ) + # Instantiate data reader and evaluator + reader = WebArenaVerifiedDataReader(config) + self.evaluator = WebArenaVerifiedEvaluatorAPI(config=config, reader=reader) def __call__( self, @@ -81,19 +91,28 @@ def __call__( # task is done: load the config file, stop playwright tracing, and evaluate the trace with open(config_file, "r") as f: config_raw = json.load(f) - config: WebArenaVerifiedTask = WebArenaVerifiedTask.model_validate(config_raw) + task: WebArenaVerifiedTask = WebArenaVerifiedTask.model_validate(config_raw) # stop playwright tracing with tempfile.TemporaryDirectory() as temp_dir: - trace_path = Path(temp_dir) / f"wav_{config.task_id}.zip" + trace_path = Path(temp_dir) / f"wav_{task.task_id}.zip" page.context.tracing.stop(path=trace_path) + # Create evaluation context + context = TaskEvalContext( + task=task, + agent_response_raw=trajectory[-1].get("answer"), + network_trace=NetworkTrace.from_content(trace_path), + environments=self.evaluator.config.environments, + ) + # Run wa_verified evaluation and return float score - logger.info(f"Running webarena_verified evaluation for task {config.task_id}") - results: list[TaskEvalResult] = self.evaluator.evaluate_task(config.task_id, 
trajectory[-1].get("answer"), trace_path) - logger.info(f"Webarena_verified evaluation result for task {config.task_id}:") - for result in results: - logger.info(f"status: {result.status}, score: {result.score}, error_msg: {result.error_msg}") + logger.info(f"Running webarena_verified evaluation for task {task.task_id}") + results: TaskEvalResult = self.evaluator.evaluate_task(context=context) + logger.info(f"Webarena_verified evaluation result for task {task.task_id}:") + logger.info(f"status: {results.status}, score: {results.score}, error_msg: {results.error_msg}") + for result in results.evaluators_results: + logger.info(f"- {result.evaluator_name}: status: {result.status}, score: {result.score}, error_msg: {result.error_msg}") # return average score - return sum(result.score for result in results) / len(results) + return sum(result.score for result in results.evaluators_results) / len(results.evaluators_results) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json index 1a607487..6e856fd2 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @@ -1,31304 +1,15491 @@ [ { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 0, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the top-{{n}} best-selling product name(s) in {{year}}", - "original.intent_template": "What is the top-{{n}} best-selling product in {{year}}", - "instantiation_dict": { - "n": 1, - "year": 2022 - }, - "intent": "What is the top-1 best-selling product name(s) in 2022", - "original.intent": "What is the top-1 best-selling product in 2022", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Quest Lumaflex\u2122 Band" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Quest Lumaflex\u2122 Band" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band" - }, "intent_template_id": 279, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-1 best-selling product name(s) in 2022", + "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", + "instantiation_dict": {"n": 1, "year": 2022}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. 
Should specifically ask for product name(s)" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Quest Lumaflex\u2122 Band"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 1, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the top-{{n}} best-selling brand name(s) in {{period}}", - "original.intent_template": "What is the top-{{n}} best-selling brand in {{period}}", - "instantiation_dict": { - "n": 1, - "period": "Quarter 1 2022" - }, - "intent": "What is the top-1 best-selling brand name(s) in Quarter 1 2022", - "original.intent": "What is the top-1 best-selling brand in Quarter 1 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Sprite" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Sprite" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Sprite" - }, "intent_template_id": 279, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-1 best-selling brand name(s) in Quarter 1 2022", + "intent_template": "Get the top-{{n}} best-selling brand name(s) in {{period}}", + "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'brand' but agent can return the id, or the name as 
part of the response leading to false negatives. Should specifically ask for brand name(s)" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Sprite"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 2, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the top-{{n}} best-selling product type name(s) in {{period}}", - "original.intent_template": "What is the top-{{n}} best-selling product type in {{period}}", - "instantiation_dict": { - "n": 1, - "period": "Quarter 1 2022" - }, - "intent": "What is the top-1 best-selling product type name(s) in Quarter 1 2022", - "original.intent": "What is the top-1 best-selling product type in Quarter 1 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yoga strap" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Yoga ball" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yoga ball" - }, "intent_template_id": 279, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'product type' but agent can return the id, or the name as part of the response leading to false negatives. 
Should specifically ask for product type name(s)" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-1 best-selling product type name(s) in Quarter 1 2022", + "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", + "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Yoga straps are the best selling product type in q1 2022" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ ["Digital Watch", "Band", "Stasis Ball", "Yoga Strap"] ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 3, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What are the top-{{n}} best-selling product name(s) in {{year}}", - "original.intent_template": "What are the top-{{n}} best-selling product in {{year}}", - "instantiation_dict": { - "n": 2, - "year": 2022 - }, - "intent": "What are the top-2 best-selling product name(s) in 2022", - "original.intent": "What are the top-2 best-selling product in 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Quest Lumaflex\u2122 Band", - "Sprite Stasis Ball 65 cm" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Quest Lumaflex\u2122 Band", - "Sprite Stasis Ball 65 cm" - ] - }, - "reference_url": "", - 
"program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band, Sprite Stasis Ball 65 cm" - }, "intent_template_id": 279, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-2 best-selling product name(s) in 2022", + "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", + "instantiation_dict": {"n": 2, "year": 2022}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. Should specifically ask for product name(s)" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Quest Lumaflex\u2122 Band", + [ + "Sprite Stasis Ball 65 cm", "Cruise Stasis Ball 65 cm", + "Sprite Stasis Ball 55 cm" + ] + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 4, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What are the top-{{n}} best-selling product name(s) in {{period}}", - "original.intent_template": "What are the top-{{n}} best-selling product in {{period}}", - "instantiation_dict": { - "n": 3, - "period": "Jan 2023" - }, - "intent": "What are the top-3 best-selling product name(s) in Jan 2023", - "original.intent": "What are the top-3 best-selling product in Jan 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": 
{ - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Impulse Duffle", - "Overnight Duffle", - "Hawkeye Yoga Short-32-Blue" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Impulse Duffle", - "Overnight Duffle", - "Hawkeye Yoga Short-32-Blue" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Impulse Duffle, Overnight Duffle, Hawkeye Yoga Short-32-Blue" - }, "intent_template_id": 279, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'product' but agent can return the id, the color as part of the response leading to false negatives. Should specifically ask for product name(s)" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-3 best-selling product name(s) in Jan 2023", + "intent_template": "Get the top-{{n}} best-selling product name(s) in {{period}}", + "instantiation_dict": {"n": 3, "period": "Jan 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Impulse Duffle", "Overnight Duffle", "Hawkeye Yoga Short-32-Blue"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 5, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the top-{{n}} best-selling product type name(s) in {{period}}", - "original.intent_template": "What is the top-{{n}} best-selling product type in {{period}}", 
- "instantiation_dict": { - "n": 1, - "period": "Jan 2023" - }, - "intent": "What is the top-1 best-selling product type name(s) in Jan 2023", - "original.intent": "What is the top-1 best-selling product type in Jan 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Duffle" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Duffle" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Duffle" - }, "intent_template_id": 279, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-1 best-selling product type name(s) in Jan 2023", + "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", + "instantiation_dict": {"n": 1, "period": "Jan 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Template asks for 'product type' but agent can return the id, or name as part of the response leading to false negatives. 
Should specifically ask for product type name(s)" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Duffle"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 6, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What are the top-{{n}} best-selling product name(s) in {{year}}", - "original.intent_template": "What are the top-{{n}} best-selling product in {{year}}", - "instantiation_dict": { - "n": 2, - "year": 2023 - }, - "original.instantiation_dict": { - "n": 5, - "year": 2023 - }, - "intent": "What are the top-2 best-selling product name(s) in 2023", - "original.intent": "What are the top-5 best-selling product in 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Sprite Yoga Strap 6 foot", - "Overnight Duffle" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Sprite Yoga Strap 6 foot", - "Overnight Duffle", - "Ida Workout Parachute Pant-29-Purple", - "Hawkeye Yoga Short-32-Blue", - "Sprite Stasis Ball 65 cm" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Sprite Yoga Strap 6 foot, Overnight Duffle, Ida Workout Parachute Pant-29-Purple, Hawkeye Yoga Short-32-Blue, Sprite Stasis Ball 65 cm" - }, "intent_template_id": 279, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "task_ambiguity", - "note": "Last two items of the top 5 do not provide consistent 
results due to numerious contenders for spots 4 and 5 changed to top 2" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top-2 best-selling product name(s) in 2023", + "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", + "instantiation_dict": {"n": 2, "year": 2023}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Sprite Yoga Strap 6 foot", + ["Overnight Duffle", "Ida Workout Parachute Pant-29-Purple"] + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 7, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent_template_id": 79, + "start_urls": ["__MAP__"], + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Mellon University", "radius": "50 km" }, - "intent": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Pittsburgh 
International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" - ] - } + "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode": { "type": "string" } + }, + "required": ["name", "state", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Pittsburgh International Airport", + "state": "Pennsylvania", + "postcode": "15231" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" - }, - "intent_template_id": 79 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 8, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent_template_id": 79, + "start_urls": ["__MAP__"], + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 5 km to Carnegie Mellon University", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to 
{{start}}", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Mellon University", "radius": "5 km" }, - "intent": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "There is no airport within 5 km of Carnegie Mellon University" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "There is no airport within 5 km of Carnegie Mellon University" - }, - "intent_template_id": 79, - "changelogs": [ + "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 9, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent_template_id": 79, + "start_urls": ["__MAP__"], + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 30 km to Carnegie Art Museum", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Art Museum", "radius": "30 km" }, - "intent": "Tell me the 
full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" - ] - } + "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode": { "type": "string" } + }, + "required": ["name", "state", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Pittsburgh International Airport", + "state": "Pennsylvania", + "postcode": "15231" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States" - }, - "intent_template_id": 79, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 10, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - 
"intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent_template_id": 79, + "start_urls": ["__MAP__"], + "intent": "Get the name, state, and zip code of all US international airports that are within a driving distance of 60 km to Niagara Falls", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", "instantiation_dict": { "airport_type": "US international airports", "start": "Niagara Falls", "radius": "60 km" }, - "intent": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode": { "type": "string" } + }, + "required": ["name", "state", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Niagara Falls International Airport", + "state": "New York", + "postcode": "14304" }, - "expected_data": [ - "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States", - "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States" - ] - } + { + "name": "Buffalo-Niagara International Airport", + "state": 
"New York", + "postcode": "14225" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States", - "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States Buffalo-Niagara International Airport, South Youngs Road, Town of Cheektowaga, Erie County, New York, 14221, United States" - }, - "intent_template_id": 79, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 11, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", - "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", - "instantiation_dict": { - "term": "disappointed" - }, - "intent": "Tell me the number of reviews that our store received so far that mention term \"disappointed\"", - "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"disappointed\"", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - 
"expected_data": [ - 6 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "6" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "6" - }, "intent_template_id": 288, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the number of reviews that our store received so far that mention term \"disappointed\"", + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "instantiation_dict": {"term": "disappointed"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [6] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 12, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", - "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", - "instantiation_dict": { - "term": "satisfied" - }, - "intent": "Tell me the number of reviews that our store received so far that mention term \"satisfied\"", - "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"satisfied\"", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 2 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2" - }, "intent_template_id": 288, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the number of reviews that our store received so far that mention term \"satisfied\"", + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "instantiation_dict": {"term": "satisfied"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [2] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 13, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", - "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", - "instantiation_dict": { - "term": "decent" - }, - "intent": "Tell me the number of reviews that our store received so far that mention term \"decent\"", - 
"original.intent": "Tell me the the number of reviews that our store received by far that mention term \"decent\"", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 2 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2" - }, "intent_template_id": 288, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the number of reviews that our store received so far that mention term \"decent\"", + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "instantiation_dict": {"term": "decent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [2] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 14, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", - "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", - 
"instantiation_dict": { - "term": "not useful" - }, - "intent": "Tell me the number of reviews that our store received so far that mention term \"not useful\"", - "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"not useful\"", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 288, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the number of reviews that our store received so far that mention term \"not useful\"", + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "instantiation_dict": {"term": "not useful"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 15, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the number of reviews that our store received so far 
that mention term \"{{term}}\"", - "original.intent_template": "Tell me the the number of reviews that our store received by far that mention term \"{{term}}\"", - "instantiation_dict": { - "term": "best" - }, - "intent": "Tell me the number of reviews that our store received so far that mention term \"best\"", - "original.intent": "Tell me the the number of reviews that our store received by far that mention term \"best\"", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 2 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2" - }, "intent_template_id": 288, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the number of reviews that our store received so far that mention term \"best\"", + "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "instantiation_dict": {"term": "best"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [2] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 16, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": { - "start": "5000 Fifth Avenue, Pittsburgh", - "end": "UPMC family health center" - }, - "intent": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "driving: 2min", - "walking: 16min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Driving: 2min. Walking: 16min." 
- }, "intent_template_id": 73, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center", + "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": {"start": "5000 Fifth Avenue, Pittsburgh", "end": "UPMC family health center"}, + "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "mode" : { "type": "string" }, + "duration": { "type": "string", "format": "duration" } + }, + "required": ["duration", "mode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "mode": "driving", "duration": "2min" }, + { "mode": "walking", "duration": "16min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 17, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": { - "start": "AMC Waterfront", - "end": "Carnegie Mellon University" - }, - "intent": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "driving: 13min", - "walking: 1h 35min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "driving: 13min, walking: 1h 35min." 
- }, "intent_template_id": 73, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the time for walking and driving route from AMC Waterfront to Carnegie Mellon University", + "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": {"start": "AMC Waterfront", "end": "Carnegie Mellon University"}, + "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "mode" : { "type": "string" }, + "duration": { "type": "string", "format": "duration" } + }, + "required": ["duration", "mode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "mode": "driving", "duration": "13min" }, + { "mode": "walking", "duration": "1hr 35min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 18, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": { - "start": "AMC Waterfront", - "end": "Univ of Pittsburgh" - }, - "intent": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "driving: 15min", - "walking: 1h 47min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "driving: 15min, walking: 1h 47min." 
- }, "intent_template_id": 73, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh", + "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": {"start": "AMC Waterfront", "end": "Univ of Pittsburgh"}, + "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "mode" : { "type": "string" }, + "duration": { "type": "string", "format": "duration" } + }, + "required": ["duration", "mode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "mode": "driving", "duration": "15min" }, + { "mode": "walking", "duration": "1hr 47min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 19, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": { - "start": "Carnegie Science Center", - "end": "Carnegie Mellon University" - }, - "intent": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "driving: 12min", - "walking: 1h 44min." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "driving: 12min, walking: 1h 44min."
- }, "intent_template_id": 73, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University", + "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": {"start": "Carnegie Science Center", "end": "Carnegie Mellon University"}, + "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "mode" : { "type": "string" }, + "duration": { "type": "string", "format": "duration" } + }, + "required": ["duration", "mode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "mode": "driving", "duration": "12min" }, + { "mode": "walking", "duration": "1hr 44min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 20, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Compare the difference in time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": { - "start": "Randyland", - "end": "Carnegie Mellon University" - }, - "intent": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "driving: 13min", - "walking: 1h 45min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "driving: 13min, walking: 1h 45min." 
- }, "intent_template_id": 73, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the time for walking and driving route from Randyland to Carnegie Mellon University", + "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", + "instantiation_dict": {"start": "Randyland", "end": "Carnegie Mellon University"}, + "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "mode" : { "type": "string" }, + "duration": { "type": "string", "format": "duration" } + }, + "required": ["duration", "mode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "mode": "driving", "duration": "13min" }, + { "mode": "walking", "duration": "1hr 45min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 21, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "ear cups being small" - }, - "intent": "List out reviewer names, if exist, who mention about ear cups being small", - "original.intent": "List out reviewers, if exist, who mention about ear cups being small", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Joseph Brzezinski", - "Catso", - "Dibbins", - "Anglebert Dinkherhump", - "Michelle Davis" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Joseph Brzezinski", - "Catso", - "Dibbins", - "Anglebert Dinkherhump", - "Michelle Davis" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, Michelle Davis" - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html" + ], + "intent": "Get name(s) of reviewer(s) who mention ear cups being small", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "instantiation_dict": {"description": "ear cups being small"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Joseph Brzezinski", "Catso", "Dibbins", "Anglebert Dinkherhump", + "Michelle Davis" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], 
"task_id": 22, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "under water photo" - }, - "intent": "List out reviewer names, if exist, who mention about under water photo", - "original.intent": "List out reviewers, if exist, who mention about under water photo", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no review about under water photo", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html" + ], + "intent": "Get name(s) of reviewer(s) who mention under water photo", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "instantiation_dict": {"description": "under water photo"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 23, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "good fingerprint resistant" - }, - "intent": "List out reviewer names, if exist, who mention about good fingerprint resistant", - "original.intent": "List out reviewers, if exist, who mention about good fingerprint resistant", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Rachel", - "T. Gannon" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Rachel", - "T. Gannon" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Rachel, T. 
Gannon, " - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html" + ], + "intent": "Get name(s) of reviewer(s) who mention good fingerprint resistant", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "instantiation_dict": {"description": "good fingerprint resistant"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Rachel", "T. 
Gannon"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 24, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "price being unfair" - }, - "intent": "List out reviewer names, if exist, who mention about price being unfair", - "original.intent": "List out reviewers, if exist, who mention about price being unfair", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no reivew about price being unfair", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html" + ], + "intent": "Get name(s) of reviewer(s) who mention price being unfair", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "instantiation_dict": {"description": "price being unfair"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": 
"Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 25, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "average print quality" - }, - "intent": "List out reviewer names, if exist, who mention about average print quality", - "original.intent": "List out reviewers, if exist, who mention about average print quality", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "GoldfishGoldfish", - "Roxanne Brandon Coffey" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Goldfish", - "Roxanne Brandon Coffey" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "GoldfishGoldfish, Roxanne Brandon Coffey" - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + 
"__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" + ], + "intent": "Get name(s) of reviewer(s) who mention print quality and give rating of 3 or less stars", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} and give rating {{rating}}", + "instantiation_dict": {"description": "print quality", "rating": "3 or less stars"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Roxanne Brandon Coffey", "Nelson"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 26, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html", - "geolocation": null, - "intent_template": "List out reviewer names, if exist, who mention about {{description}}", - "original.intent_template": "List out reviewers, if exist, who mention about {{description}}", - "instantiation_dict": { - "description": "complain of the customer service" - }, - "intent": "List out reviewer names, if exist, who mention about complain of the customer service", - "original.intent": "List out reviewers, if exist, who mention about complain of the customer service", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "RemyRRemyR", - "Bob 
in Vegas" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Bob in Vegas", - "RemyR" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Bob in Vegas, RemyRRemyR" - }, "intent_template_id": 222, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify the reviewers name instead of email or other attributes" - }, + "start_urls": [ + "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" + ], + "intent": "Get name(s) of reviewer(s) who mention complain of the customer service", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "instantiation_dict": {"description": "complain of the customer service"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["RemyRRemyR", "Bob in Vegas"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 27, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": { - "forum": "Showerthoughts" - }, - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest 
post on the Showerthoughts forum.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 33, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.", + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": {"forum": "Showerthoughts"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 28, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": { - "forum": "Worcester" - }, - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.", - 
"require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 33, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Original expected was missing a comment" - }, + "start_urls": ["__REDDIT__"], + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.", + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": {"forum": "Worcester"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 29, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": { - "forum": "DIY" - }, - "intent": "Tell me the count of comments that have received more downvotes than 
upvotes for the user who made the latest post on the DIY forum.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, "intent_template_id": 33, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.", + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": {"forum": "DIY"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 30, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": { - "forum": "space" - }, - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.", - 
"require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 33, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.", + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": {"forum": "space"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 31, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": { - "forum": "photoshopbattles" - }, - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 33, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.", + "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "instantiation_dict": {"forum": "photoshopbattles"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 32, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent_template_id": 78, + "start_urls": ["__MAP__"], + "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. 
Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.", + "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", "instantiation_dict": { "place": "Pittsburgh Airport", "information": "the walking distance", "target1": "Hilton hotel", "target2": "the nearest supermarket own by a local company" }, - "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "hotel" : { "type": "string" }, + "distance": { "type": "string", "format": "distance" } }, - "expected_data": [ - "DoubleTree by Hilton Hotel Pittsburgh Airport", - "2.0km" - ] + "required": ["hotel", "distance"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "2km"} + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "DoubleTree by Hilton Hotel Pittsburgh Airport", - "2.0km" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 2.0km" - }, - "intent_template_id": 78, 
- "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 33, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent_template_id": 78, + "start_urls": ["__MAP__"], + "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.", + "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hilton hotel", "information": "the shortest walking distance", "target2": "a supermarket" }, - "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. 
Then, tell me the the shortest walking distance to a supermarket from the hotel.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "hotel" : { "type": "string" }, + "distance": { "type": "string", "format": "distance" } }, - "expected_data": [ - "DoubleTree by Hilton Hotel Pittsburgh Airport", - "1.4km" - ] + "required": ["hotel", "distance"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "1.4km"} + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "DoubleTree by Hilton Hotel Pittsburgh Airport", - "1.4km" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 1.4km" - }, - "intent_template_id": 78, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 34, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent_template_id": 78, + "start_urls": ["__MAP__"], + "intent": "I will arrive at Pittsburgh Airport soon. 
Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.", + "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hyatt hotel", "information": "the shortest walking time", "target2": "a supermarket" }, - "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Hyatt Regency Pittsburgh International Airport" - ] + "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "hotel": {"type": "string"}, "information": {"type": "string"} }, + "required": ["hotel", "information"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "hotel": "Hyatt Regency Pittsburgh International Airport", + "information": "3h 30min" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Hyatt Regency Pittsburgh International Airport" - ], - "fuzzy_match": [ - "Time: 3h 30min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport\n3:30" - }, - 
"intent_template_id": 78, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 35, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent_template_id": 78, + "start_urls": ["__MAP__"], + "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.", + "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hyatt hotel", "information": "the minimal driving time", "target2": "a supermarket" }, - "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. 
Then, tell me the the minimal driving time to a supermarket from the hotel.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Hyatt Regency Pittsburgh International Airport" - ] + "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "hotel": {"type": "string"}, "information": {"type": "string"} }, + "required": ["hotel", "information"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "hotel": "Hyatt Regency Pittsburgh International Airport", + "information": "15min" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Hyatt Regency Pittsburgh International Airport" - ], - "fuzzy_match": [ - "Time: 15min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Time: 15min" - }, - "intent_template_id": 78, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 36, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 77, + "start_urls": ["__MAP__"], + "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University", "intent_template": "Check if the {{place}} in pittsburgh can be reached in 
one hour by car from {{location}}", "instantiation_dict": { "place": "social security administration", "location": "Carnegie Mellon University" }, - "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Yes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yes" - }, - "intent_template_id": 77, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 37, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": { - "place": "police station", - "location": "gates building at CMU" - }, - "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - 
} - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Yes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yes" - }, "intent_template_id": 77, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": {"place": "police station", "location": "gates building at CMU"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 38, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": { - "place": "duquesne university", - "location": "pittsburgh airport" - }, - "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Yes" - ] - }, - "reference_url": 
"", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yes" - }, "intent_template_id": 77, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport", + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": {"place": "duquesne university", "location": "pittsburgh airport"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 39, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": { - "place": "walmart", - "location": "5600 fifth avenue" - }, - "intent": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Yes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yes" - }, "intent_template_id": 77, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Check if 
the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue", + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "instantiation_dict": {"place": "walmart", "location": "5600 fifth avenue"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 40, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": { - "place": "amc theatre", - "location": "hobart street" - }, - "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Yes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Yes" - }, "intent_template_id": 77, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street", + "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + 
"instantiation_dict": {"place": "amc theatre", "location": "hobart street"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 41, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List the top {{n}} search term(s) in my store", - "original.intent_template": "List the top {{n}} search terms in my store", - "instantiation_dict": { - "n": "1" - }, - "intent": "List the top 1 search term(s) in my store", - "original.intent": "List the top 1 search terms in my store", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "hollister" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "hollister" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "hollister" - }, "intent_template_id": 285, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top 1 search term(s) in my store", + "intent_template": "Get the top {{n}} search term(s) in my store", + "instantiation_dict": {"n": "1"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use proper optional 
pluralization" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["hollister"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 42, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List the top {{n}} search term(s) in my store", - "original.intent_template": "List the top {{n}} search terms in my store", - "instantiation_dict": { - "n": "2" - }, - "intent": "List the top 2 search term(s) in my store", - "original.intent": "List the top 2 search terms in my store", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "hollister", - "Joust Bag" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "hollister", - "Joust Bag" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "hollister, Joust Bag" - }, "intent_template_id": 285, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use proper optional pluralization" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top 2 search term(s) in my store", + "intent_template": "Get the top {{n}} search term(s) in my store", + "instantiation_dict": {"n": "2"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + 
"results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["hollister", "Joust Bag"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 43, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List the top {{n}} search term(s) in my store", - "original.intent_template": "List the top {{n}} search terms in my store", - "instantiation_dict": { - "n": "3" - }, - "intent": "List the top 3 search term(s) in my store", - "original.intent": "List the top 3 search terms in my store", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "hollister", - "Joust Bag", - "Antonia Racer Tank" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "hollister", - "Joust Bag", - "Antonia Racer Tank" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "hollister, Joust Bag, Antonia Race Tank" - }, "intent_template_id": 285, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use proper optional pluralization" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the top 3 search term(s) in my store", + "intent_template": "Get the top {{n}} search term(s) in my store", + "instantiation_dict": {"n": "3"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + 
"results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["hollister", "Joust Bag", "Antonia Racer Tank"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 44, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 303, + "start_urls": ["__GITLAB__"], + "intent": "Navigate to my todos", "intent_template": "Navigate to my todos", - "original.intent_template": "Check out my todos", "instantiation_dict": {}, - "intent": "Navigate to my todos", - "original.intent": "Check out my todos", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/dashboard/todos" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/dashboard/todos", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 303, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/dashboard/todos", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 45, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, + "intent_template_id": 
300, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "Navigate to the most recent open issues", "intent_template": "Navigate to the most recent open issues", - "original.intent_template": "Check out the most recent open issues", "instantiation_dict": {}, - "intent": "Navigate to the most recent open issues", - "original.intent": "Check out the most recent open issues", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=opened", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 300, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": [ + "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_date&state=opened" + ], + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 46, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 300, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Navigate to the 
most recent open issues", "intent_template": "Navigate to the most recent open issues", - "original.intent_template": "Check out the most recent open issues", "instantiation_dict": {}, - "intent": "Navigate to the most recent open issues", - "original.intent": "Check out the most recent open issues", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 300, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": [ + "__GITLAB__/primer/design/-/issues", + "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" + ], + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 47, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": { - "period": "over the past month" - }, - "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_count": { - "value": 0, - "type": "numeric" - }, - "amount": { - "value": 0, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "0 order", - "$0 total spend" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0 order, $0 total spend" - }, "intent_template_id": 197, + "start_urls": ["__SHOPPING__"], + "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past month, and the total amount of money I spent.", + "intent_template": "Today is 6/12/2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": {"period": "over the past month"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "order_count": { "type": "number" }, + "amount" : { "type": "number", "format": "currency" } + }, + "required": ["amount", "order_count"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"order_count": 0, "amount": 0} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 48, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": { - "period": "over the past three days" - }, - "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_count": { - "value": 0, - "type": "numeric" - }, - "amount": { - "value": 0, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "0 order", - "$0 total spend" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0 order, $0 total spend" - }, "intent_template_id": 197, + "start_urls": ["__SHOPPING__"], + "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past three days, and the total amount of money I spent.", + "intent_template": "Today is 6/12/2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": {"period": "over the past three days"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "order_count": { "type": "number" }, + "amount" : { "type": "number", "format": "currency" } + }, + "required": ["amount", "order_count"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"order_count": 0, "amount": 0} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 49, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": { - "period": "over the past four month" - }, - "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_count": { - "value": 3, - "type": "numeric" - }, - "amount": { - "value": 845.49, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "3 orders", - "$845.49 total spend" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3 orders, $845.49 total spend" - }, "intent_template_id": 197, + "start_urls": ["__SHOPPING__"], + "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past four month, and the total amount of money I spent.", + "intent_template": "Today is 6/12/2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": {"period": "over the past four month"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "order_count": { "type": "number" }, + "amount" : { "type": "number", "format": "currency" } + }, + "required": ["amount", "order_count"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"order_count": 3, "amount": 845.49} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 50, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": { - "period": "over the past year" - }, - "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_count": { - "value": 24, - "type": "numeric" - }, - "amount": { - "value": 6560.69, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "24 orders", - "$6560.69 total spend" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "24 orders, $6560.69 total spend" - }, "intent_template_id": 197, + "start_urls": ["__SHOPPING__"], + "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past year, and the total amount of money I spent.", + "intent_template": "Today is 6/12/2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": {"period": "over the past year"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "order_count": { "type": "number" }, + "amount" : { "type": "number", "format": "currency" } + }, + "required": ["amount", "order_count"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"order_count": 21, "amount": 6560.69} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 51, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": { - "period": "over the past six month" - }, - "intent": "Today is 6/12/2023. 
Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_count": { - "value": 12, - "type": "numeric" - }, - "amount": { - "value": 1603.69, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "12 orders", - "$1603.69 total spend" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "12 orders, $1603.69 total spend" - }, "intent_template_id": 197, + "start_urls": ["__SHOPPING__"], + "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past six month, and the total amount of money I spent.", + "intent_template": "Today is 6/12/2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "instantiation_dict": {"period": "over the past six month"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "order_count": { "type": "number" }, + "amount" : { "type": "number", "format": "currency" } + }, + "required": ["amount", "order_count"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"order_count": 7, "amount": 1700.84} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 52, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": { - "start": "Carnegie Mellon University", - "end": "starbucks on Craig Street" - }, - "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "7 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "7 min" - }, "intent_template_id": 68, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?", + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": {"start": "Carnegie Mellon University", "end": 
"starbucks on Craig Street"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["7min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 53, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": { - "start": "Univ of Pittsburgh", - "end": "starbucks on Craig Street" - }, - "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "18 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "18 min" - }, "intent_template_id": 68, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?", + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": {"start": "Univ of Pittsburgh", "end": "starbucks on Craig Street"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["18min"] + } } - ] + ], + "revision": 2 
}, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 54, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": { - "start": "Carnegie Mellon University", - "end": "Univ of Pittsburgh" - }, - "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "25 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "25 min" - }, "intent_template_id": 68, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?", + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": {"start": "Carnegie Mellon University", "end": "Univ of Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["25min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 55, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": { - "start": "the starbuck near CMU", - "end": "Chatham university" - }, - "intent": "How long does it take to walk from the starbuck near CMU to Chatham university?", - "require_reset": false, 
- "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "30 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "30 min" - }, "intent_template_id": 68, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "How long does it take to walk from the starbuck near CMU to Chatham university?", + "intent_template": "How long does it take to walk from {{start}} to {{end}}?", + "instantiation_dict": {"start": "the starbuck near CMU", "end": "Chatham university"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["30min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 56, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": { - "start": "Carnegie Museum of Art", - "end": "a library at CMU" - }, - "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "11 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "11 min" - }, "intent_template_id": 68, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", + "intent_template": "How long does it take 
to walk from {{start}} to {{end}}?", + "instantiation_dict": {"start": "Carnegie Museum of Art", "end": "a library at CMU"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["11min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 57, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 69, + "start_urls": ["__MAP__"], + "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University", "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", "instantiation_dict": { "place1": "restaurant", "place2": "university center at Carnegie Mellon University" }, - "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "El Gallo de Oro", - "Back Bar Grill", - "Grano", - "Beefsteak", - "Nourish", - "Schatz Dining Room", - "Au Bon Pain" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "El Gallo de Oro", - "Back Bar Grill", - "Grano", - "Beefsteak", - "Nourish", - "Schatz Dining Room", - "Au Bon Pain" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, Au Bon Pain" - }, - 
"intent_template_id": 69, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "El Gallo de Oro", "Back Bar Grill", "Grano", "Beefsteak", "Nourish", + "Schatz Dining Room", "Au Bon Pain" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 58, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": { - "place1": "cafe", - "place2": "CMU Hunt library" - }, - "intent": "Tell me the closest cafe(s) to CMU Hunt library", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "De Fer Coffee & Tea" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "De Fer Coffee & Tea" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "De Fer Coffee & Tea" - }, "intent_template_id": 69, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the closest cafe(s) to CMU Hunt library", + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": {"place1": "cafe", "place2": "CMU Hunt library"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": 
"AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["De Fer Coffee & Tea"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 59, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": { - "place1": "restaurant", - "place2": "CMU Hunt library" - }, - "intent": "Tell me the closest restaurant(s) to CMU Hunt library", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "The exchange" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "The exchange" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "The exchange" - }, "intent_template_id": 69, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the closest restaurant(s) to CMU Hunt library", + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": {"place1": "restaurant", "place2": "CMU Hunt library"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["The exchange"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 60, - 
"require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": { - "place1": "restaurant", - "place2": "CMU Posner Hall" - }, - "intent": "Tell me the closest restaurant(s) to CMU Posner Hall", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "The exchange" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "The exchange" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "The exchange" - }, "intent_template_id": 69, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the closest restaurant(s) to CMU Posner Hall", + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": {"place1": "restaurant", "place2": "CMU Posner Hall"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["The exchange"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 61, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": { - "place1": "restaurant", - "place2": "CMU Sorrells Library" - }, - "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library", - 
"require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "La Prima Espresso" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "La Prima Espresso" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "La Prima Espresso" - }, "intent_template_id": 69, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library", + "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "instantiation_dict": {"place1": "restaurant", "place2": "CMU Sorrells Library"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["La Prima Espresso"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 62, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", - "original.intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", - "instantiation_dict": { - "quantifier": "most" - }, - "intent": "Get customer names that have completed the most number of orders in the entire history?", - "original.intent": "Which customer has completed the most number of orders 
in the entire history?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Jane Smith" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Jane Smith" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Jane Smith" - }, "intent_template_id": 276, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get customer name(s) who completed the most number of orders in the entire history", + "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", + "instantiation_dict": {"quantifier": "most"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to return names to avoid false negatives when agents return emails" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Jane Smith"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 63, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", - "original.intent_template": "Which customer(s) has completed the {{quantifier}} number of orders in the entire history?", - 
"instantiation_dict": { - "quantifier": "second most" - }, - "intent": "Get customer names that have completed the second most number of orders in the entire history?", - "original.intent": "Which customer(s) has completed the second most number of orders in the entire history?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Adam Garcia", - "Michael Nguyen", - "Sarah Miller" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Adam Garcia", - "Michael Nguyen", - "Sarah Miller" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Adam Garcia, Michael Nguyen, Sarah Miller" - }, "intent_template_id": 276, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to return names to avoid false negatives when agents return emails" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get customer name(s) who completed the second most number of orders in the entire history", + "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", + "instantiation_dict": {"quantifier": "second most"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Adam Garcia", "Michael Nguyen", "Sarah Miller"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], 
"task_id": 64, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get customer names that have placed {{number}} orders in the entire history?", - "original.intent_template": "Which customer has placed {{number}} orders in the entire history?", - "instantiation_dict": { - "number": "2" - }, - "intent": "Get customer names that have placed 2 orders in the entire history?", - "original.intent": "Which customer has placed 2 orders in the entire history?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Lisa Kim", - "Lisa Green", - "Julia Williams", - "Brian Smith", - "Alexander Thomas" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Lisa Kim", - "Lisa Green", - "Julia Williams", - "Brian Smith", - "Alexander Thomas" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lisa Kim, Lisa Green, Julia Williams, Brian Smith, Alexander Thomas" - }, "intent_template_id": 276, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to return names to avoid false negatives when agents return emails" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get customer name(s) who placed 2 orders in the entire history", + "intent_template": "Get customer name(s) who placed {{number}} orders in the entire history", + "instantiation_dict": {"number": "2"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to return names to avoid false negatives when agents return emails" 
- }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Lisa Kim", "Lisa Green", "Julia Williams", "Brian Smith", "Alexander Thomas"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 65, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get customer names that have completed the {{quantifier}} number of orders in the entire history?", - "original.intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?", - "instantiation_dict": { - "quantifier": "fifth most" - }, - "intent": "Get customer names that have completed the fifth most number of orders in the entire history?", - "original.intent": "Which customer has completed the fifth most number of orders in the entire history?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Jason Miller", - "Jennifer White" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Jane Doe" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Jane Doe" - }, "intent_template_id": 276, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to return names to avoid false negatives when agents return emails" - }, - { - "key": "expected_retrieve_value", - 
"category": "reference_alignment", - "note": "Both have 5 completed orders which brings them in fifth place (1st 11 orders, 2nd 8, 3rd 7, 4th 6, 5th 5)" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get customer name(s) who completed the fifth most number of orders in the entire history", + "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", + "instantiation_dict": {"quantifier": "fifth most"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Jason Miller", "Jennifer White"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 66, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 17, + "start_urls": ["__REDDIT__"], + "intent": "Among the top 10 post in \"books\" forum, Provide the URLs of posts that recommend a single book", "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", "instantiation_dict": { "number": 10, "subreddit": "books", "description": "Provide the URLs of posts that recommend a single book" }, - "original.instantiation_dict": { - "number": 10, - "subreddit": "books", - "description": "show me the post URLs that recommand a single book" - }, - "intent": "Among the top 10 post in \"books\" forum, Provide the URLs of posts that recommend a single book", - "original.intent": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - 
"eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", - "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", - "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "__REDDIT__/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol, __REDDIT__/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" - }, - "intent_template_id": 17, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Typo in the description" - }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", + "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 67, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 17, + "start_urls": 
["__REDDIT__"], + "intent": "Among the top 10 post in \"books\" forum, Provide the book names from posts that recommend a single book", "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", "instantiation_dict": { "number": 10, "subreddit": "books", "description": "Provide the book names from posts that recommend a single book" }, - "original.instantiation_dict": { - "number": 10, - "subreddit": "books", - "description": "show me the book names from posts that recommand a single book" - }, - "intent": "Among the top 10 post in \"books\" forum, Provide the book names from posts that recommend a single book", - "original.intent": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "A Christmas Carol", - "The Hobbit" - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "A Christmas Carol", - "The Hobbit" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "A Christmas Carol, The Hobbit" - }, - "intent_template_id": 17, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Typo in the description" - }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["A Christmas Carol", "The Hobbit"] + } } - ] + ], + "revision": 2 
}, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 68, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 17, + "start_urls": ["__REDDIT__"], + "intent": "Among the top 10 post in \"books\" forum, Provide the author names and the book names from posts that recommend a single book", "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", "instantiation_dict": { "number": 10, "subreddit": "books", "description": "Provide the author names and the book names from posts that recommend a single book" }, - "original.instantiation_dict": { - "number": 10, - "subreddit": "books", - "description": "show me the author name and the book name from posts that recommand a single book" - }, - "intent": "Among the top 10 post in \"books\" forum, Provide the author names and the book names from posts that recommend a single book", - "original.intent": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "book": { - "value": "The Little Prince", - "type": "text" - }, - "author": { - "value": "Levar Burton", - "type": "text" - } - }, - { - "book": { - "value": "The Hobbit", - "type": "text" - }, - "author": { - "value": "Tolkien", - "type": "text" - } - } - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "A Christmas Carol", - "Levar Burton", - "The Hobbit", - "J. R. R. 
Tolkien" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "A Christmas Carol by Levar Burton: , The Hobbit by J. R. R. Tolkien" - }, - "intent_template_id": 17, "format_specification": "For each pair, return a dictionary with the key 'book' for the book name and 'author' for the author name.", - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Typo in the description" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "book": {"type": "string"}, "author": {"type": "string"} }, + "required": ["author", "book"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "book": "The Little Prince", "author": "Levar Burton" }, + { "book": "The Hobbit" , "author": "Tolkien" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 69, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 17, + "start_urls": ["__REDDIT__"], + "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the names of the organizations involved", "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", "instantiation_dict": { "number": 10, "subreddit": "books", "description": "is there any post talks about supporting local book stores? 
If so, tell me the names of the organizations involved" }, - "original.instantiation_dict": { - "number": 10, - "subreddit": "books", - "description": "is there any post talks about supporting local book stores? If so, tell me the organizations involved" - }, - "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the names of the organizations involved", - "original.intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "bookshop.org" - ] - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "bookshop.org" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "bookshop.org" - }, - "intent_template_id": 17, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Task did not indicate to return the names of the organizations" - }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["bookshop.org"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 70, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the zip code of {{place}}?", 
- "instantiation_dict": { - "place": "Carnegie Mellon University" - }, + "intent_template_id": 70, + "start_urls": ["__MAP__"], "intent": "What is the zip code of Carnegie Mellon University?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "15213" - ] - } + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": {"place": "Carnegie Mellon University"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["15213"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "15213" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "15213" - }, - "intent_template_id": 70 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 71, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": { - "place": "Chatham University" - }, + "intent_template_id": 70, + "start_urls": ["__MAP__"], "intent": "What is the zip code of Chatham University?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "15232" - ] - } + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": {"place": "Chatham University"}, + "format_specification": null, + "start_url_context": null, + 
"eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["15232"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "15232" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "15232" - }, - "intent_template_id": 70 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 72, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": { - "place": "Yale University" - }, + "intent_template_id": 70, + "start_urls": ["__MAP__"], "intent": "What is the zip code of Yale University?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "06516" - ] - } + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": {"place": "Yale University"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["06516"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "06516" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "06516" - }, - "intent_template_id": 70 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 73, 
- "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": { - "place": "Columbia University" - }, + "intent_template_id": 70, + "start_urls": ["__MAP__"], "intent": "What is the zip code of Columbia University?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "10027" - ] - } + "intent_template": "What is the zip code of {{place}}?", + "instantiation_dict": {"place": "Columbia University"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["10027"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "10027" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "10027" - }, - "intent_template_id": 70 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 74, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent_template_id": 65, + "start_urls": ["__MAP__"], + "intent": "Given the following locations,\"Carnegie Mellon University\", \"apple store shadyside\", \"starbucks on craig street\", what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", "instantiation_dict": { "place_list": [ - "Carnegie Mellon University", - "apple store shadyside", + "Carnegie Mellon University", "apple store shadyside", "starbucks on craig street" ] }, - "intent": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Carnegie Mellon University, starbucks on forbes ave, apple store shadyside" - }, - "intent_template_id": 65, - "changelogs": [ + "format_specification": "Return the list of place in order using their names.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Carnegie Mellon University", "starbucks on craig street", + "apple store shadyside" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 75, - "require_login": true, 
- "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent_template_id": 65, + "start_urls": ["__MAP__"], + "intent": "Given the following locations,\"Massachusetts Institute of Technology\", \"Harvard University\", \"Boston Logan International Airport\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", "instantiation_dict": { "place_list": [ - "Massachusetts Institute of Technology", - "Harvard University", + "Massachusetts Institute of Technology", "Harvard University", "Boston Logan International Airport" ] }, - "intent": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport" - }, - "intent_template_id": 65, - "changelogs": [ + "format_specification": "Return the list of place in order using their names.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Massachusetts Institute of Technology", "Harvard University", + "Boston Logan International Airport" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 76, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent_template_id": 65, + "start_urls": ["__MAP__"], + "intent": "Given the following locations,\"Princeton University\", \"Yale University\", \"Harvard University\", what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed.", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", "instantiation_dict": { - "place_list": [ - "Princeton University", - "Yale University", - "Harvard University" - ] + "place_list": ["Princeton University", "Yale University", "Harvard University"] }, - "intent": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "The order is Princeton University, Yale University, Harvard University" - ] - } + "format_specification": "Return the list of place in order using their names.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Princeton University", "Yale University", "Harvard University"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "The order is Princeton University, Yale University, Harvard University" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Princeton University, Yale University, Harvard University" - }, - "intent_template_id": 65, - "changelogs": [ - { - 
"key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 77, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", - "instantiation_dict": { - "status": "Pending" - }, - "intent": "What is the total count of Pending reviews amongst all the reviews?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 5 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "5" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "5" - }, "intent_template_id": 277, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What is the total count of Pending reviews amongst all the reviews?", + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": {"status": "Pending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [5] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 78, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - 
"start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", - "instantiation_dict": { - "status": "Approved" - }, - "intent": "What is the total count of Approved reviews amongst all the reviews?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 346 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "346" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "346" - }, "intent_template_id": 277, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What is the total count of Approved reviews amongst all the reviews?", + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": {"status": "Approved"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [346] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 79, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", - "instantiation_dict": { - "status": "Not Approved" - }, - "intent": "What is the total count of Not Approved reviews 
amongst all the reviews?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 277, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What is the total count of Not Approved reviews amongst all the reviews?", + "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "instantiation_dict": {"status": "Not Approved"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 80, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 72, + "start_urls": ["__MAP__"], + "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?", "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", "instantiation_dict": { "place_A": "Carnegie Mellon University", "place_B": "Starbucks on Craig Street", "place_C": "Pittsburgh International Airport" }, - "intent": "What is the duration 
required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "38 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "38 min" - }, - "intent_template_id": 72, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["38min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 81, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 72, + "start_urls": ["__MAP__"], + "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?", "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", "instantiation_dict": { "place_A": "Univ of Pittsburgh", "place_B": "starbucks on Craig Street", "place_C": "Pittsburgh International Airport" }, - "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "49 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": 
"", - "reference_answer_raw_annotation": "49 min" - }, - "intent_template_id": 72, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["49min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 82, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 72, + "start_urls": ["__MAP__"], + "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", "instantiation_dict": { "place_A": "Massachusetts Institute of Technology", "place_B": "Harvard University", "place_C": "Boston Logan International Airport" }, - "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "63 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "63 min" - }, - "intent_template_id": 72, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + 
"results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["63min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 83, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 72, + "start_urls": ["__MAP__"], + "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?", "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", "instantiation_dict": { "place_A": "Carnegie Mellon University", "place_B": "apple store shadyside", "place_C": "starbucks on craig street" }, - "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "22 min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "22 min" - }, - "intent_template_id": 72, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["22min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 84, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": 
"From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": { - "hotel": "DoubleTree by Hilton New York Downtown", - "place": "Keens Steakhouse" - }, - "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "14 minutes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "14 minutes" - }, "intent_template_id": 64, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": {"hotel": "DoubleTree by Hilton New York Downtown", "place": "Keens Steakhouse"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["14min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 85, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 64, + "start_urls": ["__MAP__"], + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?", "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", "instantiation_dict": { "hotel": "La Quinta Inn 
near the airport", "place": "Carnegie Mellon University" }, - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "30 minutes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "30 minutes" - }, - "intent_template_id": 64, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["30min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 86, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": { - "hotel": "La Quinta Inn near the airport", - "place": "Upitt" - }, - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "29 minutes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "29 minutes" - }, "intent_template_id": 64, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?", + "intent_template": "From my stay at 
{{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": {"hotel": "La Quinta Inn near the airport", "place": "Upitt"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["29min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 87, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": { - "hotel": "red roof inn", - "place": "Pittsburgh science museum" - }, - "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "20 minutes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "20 minutes" - }, "intent_template_id": 64, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": {"hotel": "red roof inn", "place": "Pittsburgh science museum"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + 
"results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["20min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 88, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": { - "hotel": "Homewood Suites Southpointe", - "place": "PPG Paints Arena" - }, - "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "34 minutes" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "34 minutes" - }, "intent_template_id": 64, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "instantiation_dict": {"hotel": "Homewood Suites Southpointe", "place": "PPG Paints Arena"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["34min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 89, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": 
null, - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": { - "state": "Connecticut" - }, - "intent": "Which US states border Connecticut?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Rhode Island", - "Massachusetts", - "New York" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Rhode Island", - "Massachusetts", - "New York" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York" - }, "intent_template_id": 67, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Which US states border Connecticut?", + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": {"state": "Connecticut"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Rhode Island", "Massachusetts", "New York"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 90, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": { - "state": "Pennsylvania" - }, - "intent": "Which US states border Pennsylvania?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": 
false, - "type": "text" - }, - "expected_data": [ - "Ohio", - "Maryland", - "New York", - "New Jersey", - "Delaware", - "West Virginia" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Ohio", - "Maryland", - "New York", - "New Jersey", - "Delaware", - "West Virginia" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Ohio, Maryland, New York, New Jersey, Delaware, West Virginia" - }, "intent_template_id": 67, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Which US states border Pennsylvania?", + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": {"state": "Pennsylvania"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Ohio", "Maryland", "New York", "New Jersey", "Delaware", "West Virginia"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 91, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": { - "state": "Massachusetts" - }, - "intent": "Which US states border Massachusetts?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Rhode Island", - "Connecticut", - "New York", - "New Hampshire", - "Vermont" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - 
"string_match" - ], - "reference_answers": { - "must_include": [ - "Rhode Island", - "Connecticut", - "New York", - "New Hampshire", - "Vermont" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Rhode Island, Connecticut, New York, New Hampshire, Vermont" - }, "intent_template_id": 67, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Which US states border Massachusetts?", + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": {"state": "Massachusetts"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Rhode Island", "Connecticut", "New York", "New Hampshire", "Vermont"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 92, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": { - "state": "Vermont" - }, - "intent": "Which US states border Vermont?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "New York", - "New Hampshire", - "Massachusetts" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "New York", - "New Hampshire", - "Massachusetts" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "New York, New Hampshire, Massachusetts" 
- }, "intent_template_id": 67, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Which US states border Vermont?", + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": {"state": "Vermont"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["New York", "New Hampshire", "Massachusetts"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 93, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": { - "state": "New Hampshire" - }, - "intent": "Which US states border New Hampshire?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Massachusetts", - "Vermont", - "Maine" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Massachusetts", - "Vermont", - "Maine" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Massachusetts, Vermont, Maine" - }, "intent_template_id": 67, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Which US states border New Hampshire?", + "intent_template": "Which US states border {{state}}?", + "instantiation_dict": {"state": "New Hampshire"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": 
"permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Massachusetts", "Vermont", "Maine"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 94, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the grand total of invoice {{id}}.", - "original.intent_template": "Telll me the grand total of invoice {{id}}.", - "instantiation_dict": { - "id": "000000001" - }, - "intent": "Tell me the grand total of invoice 000000001.", - "original.intent": "Telll me the grand total of invoice 000000001.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "36.39" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "36.39" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$36.39" - }, "intent_template_id": 274, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Update Tell spelling" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the grand total of invoice 000000001.", + "intent_template": "Tell me the grand total of invoice {{id}}.", + "instantiation_dict": {"id": "000000001"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": 
"AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["36.39"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 95, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the grand total of invoice {{id}}.", - "original.intent_template": "Telll me the grand total of invoice {{id}}.", - "instantiation_dict": { - "id": "000000002" - }, - "intent": "Tell me the grand total of invoice 000000002.", - "original.intent": "Telll me the grand total of invoice 000000002.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "39.64" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "39.64" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$39.64" - }, "intent_template_id": 274, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Update Tell spelling" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the grand total of invoice 000000002.", + "intent_template": "Tell me the grand total of invoice {{id}}.", + "instantiation_dict": {"id": "000000002"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + 
"expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["39.64"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 96, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 193, + "start_urls": ["__SHOPPING__"], + "intent": "Tell me the status of my latest order and when will it arrive", "intent_template": "Tell me the status of my latest order and when will it arrive", "instantiation_dict": {}, - "intent": "Tell me the status of my latest order and when will it arrive", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" + "format_specification": "Use \"status\" for status and \"arrival_date\" for arrival date (null if not applicable).", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "status" : { "type": "string" }, + "arrival_date": { "type": "string", "format": "date" } }, - "expected_data": [ - { - "status": { - "value": "canceled", - "type": "text" - }, - "arrival_date": { - "value": null, - "type": "text" - } - } - ] + "required": ["arrival_date", "status"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"status": "canceled", "arrival_date": null} ] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "The last order was canceled. It will never arrive." - ] - }, - "reference_url": "", - "program_html": [], - "reference_answer_raw_annotation": "The last order was canceled. 
It will never arrive.", - "string_note": "" - }, - "intent_template_id": 193, - "format_specification": "Use \"status\" for status and \"arrival_date\" for arrival date (null if not applicable).", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "map", - "wikipedia" - ], + "sites": ["map", "wikipedia"], "task_id": 97, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 120, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", "instantiation_dict": {}, - "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "914km" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "914km" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "914 km" - }, - "intent_template_id": 120, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + 
"retrieved_data": ["914km"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 98, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": { - "places": "tea cafe", - "start": "University of Pittsburgh" - }, + "intent_template_id": 66, + "start_urls": ["__MAP__"], "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": {"places": "tea cafe", "start": "University of Pittsburgh"}, + "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "location": { "type": "string" }, + "distance": { "type": "string", "format": "distance" } }, - "expected_data": [ - "Fuku Tea", - "3716", - "Forbes Avenue", - "Central Oakland", - "Pittsburgh", - "653m" - ] + "required": ["distance", "location"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "location": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "distance": "653m" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Fuku Tea", - "3716", - 
"Forbes Avenue", - "Central Oakland", - "Pittsburgh", - "653m" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n653m" - }, - "intent_template_id": 66, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 99, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": { - "places": "Five Guys", - "start": "5700 Penn Ave" - }, + "intent_template_id": 66, + "start_urls": ["__MAP__"], "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": {"places": "Five Guys", "start": "5700 Penn Ave"}, + "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "location": { "type": "string" }, + "distance": { "type": "string", "format": "distance" } }, - "expected_data": [ - "Five Guys", - "117", - "South Bouquet Street", - "North Oakland", - "Pittsburgh", - "4.0km" - ] + "required": ["distance", "location"] } + }, + "expected": { + 
"performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "location": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "distance": "4km" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Five Guys", - "117", - "South Bouquet Street", - "North Oakland", - "Pittsburgh", - "4.0km" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n4.0km" - }, - "intent_template_id": 66, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 100, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": { - "places": "Starbucks", - "start": "Carnegie Mellon" - }, + "intent_template_id": 66, + "start_urls": ["__MAP__"], "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": {"places": "Starbucks", "start": "Carnegie Mellon"}, + "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "start_url_context": null, + "eval": [ + { + "evaluator": 
"AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "location": { "type": "string" }, + "distance": { "type": "string", "format": "distance" } }, - "expected_data": [ - "Starbucks", - "417", - "South Craig Street", - "Bellefield", - "Pittsburgh", - "557m" - ] + "required": ["distance", "location"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "location": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "distance": "557m" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Starbucks", - "417", - "South Craig Street", - "Bellefield", - "Pittsburgh", - "557m" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n557m" - }, - "intent_template_id": 66, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 101, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": { - "places": "In-N-Out", - "start": "Upitts" - }, - "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no In-N-Out near 
University of Pittsburgh", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 66, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?", + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", + "instantiation_dict": {"places": "In-N-Out", "start": "Upitts"}, + "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 102, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "help wanted", - "repo": "a11yproject/a11yproject.com" - }, - "original.instantiation_dict": { - "label": "help needed", - "repo": "a11yproject/a11yproject.com" - }, - "intent": "Navigate to and display the list of issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", - "original.intent": "Display the list of issues in the a11yproject/a11yproject.com repository that have labels related to help needed", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": 
"__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 349, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Display the list of open issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "help wanted", "repo": "a11yproject/a11yproject.com"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "instantiation_dict", - "category": "reference_alignment", - "note": "Use available label 'help wanted'; correct org/repo" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 103, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "original.intent_template": "Display the list of issues 
in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "question", - "repo": "kkroening/ffmpeg-python" - }, - "original.instantiation_dict": { - "label": "questions", - "repo": "kkroening/ffmpeg-python" - }, - "intent": "Navigate to and display the list of issues in the kkroening/ffmpeg-python repository that have labels related to question", - "original.intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 349, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "question", "repo": "kkroening/ffmpeg-python"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Use singular 'question' label" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": 
"__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 104, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "flaky-test", - "repo": "keycloak/keycloak" - }, - "intent": "Navigate to and display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", - "original.intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 349, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "flaky-test", "repo": "keycloak/keycloak"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", 
"retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 105, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "OpenAPI Generator CLI", - "repo": "OpenAPITools/openapi-generator" - }, - "intent": "Navigate to and display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", - "original.intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 349, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI 
Generator CLI", + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "OpenAPI Generator CLI", "repo": "OpenAPITools/openapi-generator"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 106, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "original.intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "BUG", - "repo": "umano/AndroidSlidingUpPanel" - }, - "intent": "Navigate to and display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", - "original.intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - 
"reference_answers": null, - "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 349, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", + "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "BUG", "repo": "umano/AndroidSlidingUpPanel"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 107, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the monthly count of successful orders {{period}}", - "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", - "instantiation_dict": { - "period": "from May to December 2022" - }, + "intent_template_id": 270, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the monthly count of successful orders from May to December 2022", - "original.intent": "Presents the monthly count of successful orders from May to December 2022 in 
MM:COUNT format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "month": { - "value": "May", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "June", - "type": "month" - }, - "count": { - "value": 13, - "type": "numeric" - } - }, - { - "month": { - "value": "July", - "type": "month" - }, - "count": { - "value": 9, - "type": "numeric" - } - }, - { - "month": { - "value": "August", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "Sepetember", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - }, - { - "month": { - "value": "October", - "type": "month" - }, - "count": { - "value": 4, - "type": "numeric" - } - }, - { - "month": { - "value": "November", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - }, - { - "month": { - "value": "December", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - } - ] + "intent_template": "Get the monthly count of successful orders {{period}}", + "instantiation_dict": {"period": "from May to December 2022"}, + "format_specification": "Return a list of objects, where each object includes a \"month\" field for the month and a \"count\" field for the count.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, + "required": ["count", "month"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "May" , "count": 8 }, + { "month": "June" , "count": 13 }, + { "month": "July" , "count": 9 }, 
+ { "month": "August" , "count": 8 }, + { "month": "September" , "count": 10 }, + { "month": "October" , "count": 4 }, + { "month": "November" , "count": 5 }, + { "month": "December" , "count": 10 } + ] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "May: 8 orders", - "June: 13 orders", - "July: 9 orders", - "August: 8 orders", - "Sepetember: 10 orders", - "October: 4 orders", - "November: 5 orders", - "December: 10 orders" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " - }, - "intent_template_id": 270, - "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "'Presents' does not match the evaluation that checks for a return value" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 108, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the monthly count of successful orders {{period}}", - "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", - "instantiation_dict": { - "period": "01/2023-05/2023" - }, - "intent": "Get the monthly count of successful orders 01/2023-05/2023", - "original.intent": "Presents the monthly count of successful orders 01/2023-05/2023 in MM:COUNT format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized",
- "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "month": { - "value": "January", - "type": "month" - }, - "count": { - "value": 12, - "type": "numeric" - } - }, - { - "month": { - "value": "February", - "type": "month" - }, - "count": { - "value": 7, - "type": "numeric" - } - }, - { - "month": { - "value": "March", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - }, - { - "month": { - "value": "April", - "type": "month" - }, - "count": { - "value": 9, - "type": "numeric" - } - }, - { - "month": { - "value": "May", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "January: 12 orders", - "Feburary: 7 orders", - "March: 5 orders", - "April: 9 orders", - "May: 5 orders" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "January: 12 orders Febulary: 7 orders March: 5 orders Apirl: 9 orders May: 5 orders" - }, "intent_template_id": 270, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the monthly count of successful orders 01/2023-05/2023", + "intent_template": "Get the monthly count of successful orders {{period}}", + "instantiation_dict": {"period": "01/2023-05/2023"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "'Presents' does not match the evaluation that checks for a return value" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "month": {"type": "string", "format": "month"}, 
"count": {"type": "number"} }, + "required": ["count", "month"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "January" , "count": 12 }, + { "month": "February", "count": 7 }, + { "month": "March" , "count": 5 }, + { "month": "April" , "count": 9 }, + { "month": "May" , "count": 5 } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 109, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the monthly count of successful orders {{period}}", - "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", - "instantiation_dict": { - "period": "from Jan to December 2022" - }, - "intent": "Get the monthly count of successful orders from Jan to December 2022", - "original.intent": "Presents the monthly count of successful orders from Jan to December 2022 in MM:COUNT format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "month": { - "value": "January", - "type": "month" - }, - "count": { - "value": 11, - "type": "numeric" - } - }, - { - "month": { - "value": "February", - "type": "month" - }, - "count": { - "value": 16, - "type": "numeric" - } - }, - { - "month": { - "value": "March", - "type": "month" - }, - "count": { - "value": 14, - "type": "numeric" - } - }, - { - "month": { - "value": "April", - "type": "month" - }, - "count": { - "value": 7, - "type": "numeric" - } - }, - { - "month": { - "value": "May", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "June", - "type": "month" - }, - "count": { - "value": 13, - "type": "numeric" - 
} - }, - { - "month": { - "value": "July", - "type": "month" - }, - "count": { - "value": 9, - "type": "numeric" - } - }, - { - "month": { - "value": "August", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "September", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - }, - { - "month": { - "value": "October", - "type": "month" - }, - "count": { - "value": 4, - "type": "numeric" - } - }, - { - "month": { - "value": "November", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - }, - { - "month": { - "value": "December", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "January: 11 orders", - "Feburary: 16 orders", - "March: 14 orders", - "April: 7 orders", - "May: 8 orders", - "June: 13 orders", - "July: 9 orders", - "August: 8 orders", - "Sepetember: 10 orders", - "Octorbor: 4 orders", - "November: 5 orders", - "December: 10 orders" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders " - }, "intent_template_id": 270, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the monthly count of successful orders from Jan to December 2022", + "intent_template": "Get the monthly count of successful orders {{period}}", + "instantiation_dict": {"period": "from Jan to December 2022"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "'Presents' does not match the evaluation that 
checks for a return value" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" - } - ] - }, - { - "sites": [ - "shopping_admin" - ], + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, + "required": ["count", "month"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "January" , "count": 11 }, + { "month": "February" , "count": 16 }, + { "month": "March" , "count": 14 }, + { "month": "April" , "count": 7 }, + { "month": "May" , "count": 8 }, + { "month": "June" , "count": 13 }, + { "month": "July" , "count": 9 }, + { "month": "August" , "count": 8 }, + { "month": "September", "count": 10 }, + { "month": "October" , "count": 4 }, + { "month": "November" , "count": 5 }, + { "month": "December" , "count": 10 } + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping_admin"], "task_id": 110, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the monthly count of successful orders {{period}}", - "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", - "instantiation_dict": { - "period": "from Jan to Nov 2022" - }, - "intent": "Get the monthly count of successful orders from Jan to Nov 2022", - "original.intent": "Presents the monthly count of successful orders from Jan to Nov 2022 in MM:COUNT format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "month": { - "value": "January", 
- "type": "month" - }, - "count": { - "value": 11, - "type": "numeric" - } - }, - { - "month": { - "value": "February", - "type": "month" - }, - "count": { - "value": 16, - "type": "numeric" - } - }, - { - "month": { - "value": "March", - "type": "month" - }, - "count": { - "value": 14, - "type": "numeric" - } - }, - { - "month": { - "value": "April", - "type": "month" - }, - "count": { - "value": 7, - "type": "numeric" - } - }, - { - "month": { - "value": "May", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "June", - "type": "month" - }, - "count": { - "value": 13, - "type": "numeric" - } - }, - { - "month": { - "value": "July", - "type": "month" - }, - "count": { - "value": 9, - "type": "numeric" - } - }, - { - "month": { - "value": "August", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "September", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - }, - { - "month": { - "value": "October", - "type": "month" - }, - "count": { - "value": 4, - "type": "numeric" - } - }, - { - "month": { - "value": "November", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "January: 11 orders", - "Feburary: 16 orders", - "March: 14 orders", - "April: 7 orders", - "May: 8 orders", - "June: 13 orders", - "July: 9 orders", - "August: 8 orders", - "Sepetember: 10 orders", - "Octorbor: 4 orders", - "November: 5 orders" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders " - }, "intent_template_id": 270, + 
"start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the monthly count of successful orders from Jan to Nov 2022", + "intent_template": "Get the monthly count of successful orders {{period}}", + "instantiation_dict": {"period": "from Jan to Nov 2022"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "'Presents' does not match the evaluation that checks for a return value" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" - } - ] - }, - { - "sites": [ - "shopping_admin" - ], + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, + "required": ["count", "month"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "January" , "count": 11 }, + { "month": "February" , "count": 16 }, + { "month": "March" , "count": 14 }, + { "month": "April" , "count": 7 }, + { "month": "May" , "count": 8 }, + { "month": "June" , "count": 13 }, + { "month": "July" , "count": 9 }, + { "month": "August" , "count": 8 }, + { "month": "September", "count": 10 }, + { "month": "October" , "count": 4 }, + { "month": "November" , "count": 5 } + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping_admin"], "task_id": 111, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the monthly count of successful orders {{period}}", - "original.intent_template": "Presents the monthly count of successful orders {{period}} in MM:COUNT format", - "instantiation_dict": { - "period": "from Feb to Nov 2022" - }, - "intent": 
"Get the monthly count of successful orders from Feb to Nov 2022", - "original.intent": "Presents the monthly count of successful orders from Feb to Nov 2022 in MM:COUNT format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "month": { - "value": "February", - "type": "month" - }, - "count": { - "value": 16, - "type": "numeric" - } - }, - { - "month": { - "value": "March", - "type": "month" - }, - "count": { - "value": 14, - "type": "numeric" - } - }, - { - "month": { - "value": "April", - "type": "month" - }, - "count": { - "value": 7, - "type": "numeric" - } - }, - { - "month": { - "value": "May", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "June", - "type": "month" - }, - "count": { - "value": 13, - "type": "numeric" - } - }, - { - "month": { - "value": "July", - "type": "month" - }, - "count": { - "value": 9, - "type": "numeric" - } - }, - { - "month": { - "value": "August", - "type": "month" - }, - "count": { - "value": 8, - "type": "numeric" - } - }, - { - "month": { - "value": "September", - "type": "month" - }, - "count": { - "value": 10, - "type": "numeric" - } - }, - { - "month": { - "value": "October", - "type": "month" - }, - "count": { - "value": 4, - "type": "numeric" - } - }, - { - "month": { - "value": "November", - "type": "month" - }, - "count": { - "value": 5, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Feburary: 16 orders", - "March: 14 orders", - "April: 7 orders", - "May: 8 orders", - "June: 13 orders", - "July: 9 orders", - "August: 8 orders", - "Sepetember: 10 orders", - "Octorbor: 4 orders", - "November: 5 orders" - ] - }, - "reference_url": "", - "program_html": 
[], - "string_note": "", - "reference_answer_raw_annotation": "Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders " - }, "intent_template_id": 270, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the monthly count of successful orders from Feb to Nov 2022", + "intent_template": "Get the monthly count of successful orders {{period}}", + "instantiation_dict": {"period": "from Feb to Nov 2022"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "'Presents' does not match the evaluation that checks for a return value" - }, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, + "required": ["count", "month"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "February" , "count": 16 }, + { "month": "March" , "count": 14 }, + { "month": "April" , "count": 7 }, + { "month": "May" , "count": 8 }, + { "month": "June" , "count": 13 }, + { "month": "July" , "count": 9 }, + { "month": "August" , "count": 8 }, + { "month": "September", "count": 10 }, + { "month": "October" , "count": 4 }, + { "month": "November" , "count": 5 } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 112, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Return the customer names for those 
who gave a rating of 3 stars or below for {{product}}?", - "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", - "instantiation_dict": { - "product": "Circe fleece" - }, - "intent": "Return the customer names for those who gave a rating of 3 stars or below for Circe fleece?", - "original.intent": "Show me the customers who have expressed dissatisfaction with Circe fleece?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Hannah Lim" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Hannah Lim" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Hannah Lim" - }, "intent_template_id": 245, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Circe fleece", + "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "instantiation_dict": {"product": "Circe fleece"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Hannah Lim"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 113, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - 
"geolocation": null, - "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", - "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", - "instantiation_dict": { - "product": "Olivia zip jacket" - }, - "intent": "Return the customer names for those who gave a rating of 3 stars or below for Olivia zip jacket?", - "original.intent": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Emma Lopez", - "Seam Miller" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Emma Lopez", - "Seam Miller" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Emma Lopez, Seam Miller" - }, "intent_template_id": 245, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Olivia zip jacket", + "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "instantiation_dict": {"product": "Olivia zip jacket"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": 
["Emma Lopez", "Seam Miller"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 114, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", - "original.intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?", - "instantiation_dict": { - "product": "Antonia racer tank" - }, - "intent": "Return the customer names for those who gave a rating of 3 stars or below for Antonia racer tank?", - "original.intent": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Shaunte", - "Merrie" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Shaunte", - "Merrie" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Shaunte, Merrie" - }, "intent_template_id": 245, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Antonia racer tank", + "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "instantiation_dict": {"product": "Antonia racer tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note":
"Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Shaunte", "Merrie"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 115, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", - "original.intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}", - "instantiation_dict": { - "product": "Chloe tank" - }, - "intent": "Return the customer names for those who gave a rating of 3 stars or below for Chloe tank?", - "original.intent": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no negative review for Chloe tank", - "reference_answer_raw_annotation": "" - }, "intent_template_id": 245, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Chloe tank", + "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "instantiation_dict": {"product": "Chloe tank"}, + "format_specification": null, + "start_url_context":
null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, - { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 116, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Return the customer names for those who gave a rating of 3 stars or below for {{product}}?", - "original.intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}?", - "instantiation_dict": { - "product": "tanks products" - }, - "intent": "Return the customer names for those who gave a rating of 3 stars or below for tanks products?", - "original.intent": "Show me the name of the customers who have expressed dissatisfaction with tanks products?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Alexander", - "Carma", - "Dominic", - "Merrie", - "Monroe", - "Scotty", - "Shaunte", - "Teofila", - "Valorie", - "Yan" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Alexander", - "Carma", - "Dominic", - "Merrie", - "Monroe", - "Scotty", - "Shaunte", - "Teofila", - "Valorie" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Alexander, Carma, 
Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, Valorie" - }, "intent_template_id": 245, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Missing expected reviewer name" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Return the customer name(s) who gave a rating of 3 stars or below for tanks products", + "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "instantiation_dict": {"product": "tanks products"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Alexander", "Carma", "Dominic", "Merrie", "Monroe", "Scotty", "Shaunte", + "Teofila", "Valorie", "Yan", "Trey", "Edmund", "Tracey", "Mikkel", "Matthew", + "Joey" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 117, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return the date when I made my first purchase on this site?", - "original.intent_template": "What is the date when I made my first purchase on this site?", - "instantiation_dict": {}, - "intent": "Return the date when I made my first purchase on this site?", - "original.intent": "What is the date when I made my first purchase on this site?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name":
"verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "3/2/22" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "3/2/22" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3/2/22" - }, "intent_template_id": 161, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the date when I made my first purchase on this site", + "intent_template": "Get the date when I made my first purchase on this site", + "instantiation_dict": {}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["3/2/22"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 118, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 151, + "start_urls": ["__SHOPPING__"], + "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", "instantiation_dict": {}, - "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": 
null, - "program_html": [ - { - "url": "last", - "locator": "", - "required_contents": { - "must_include": [ - "jaw bruxism", - "mouth guard" - ] - } - } - ] - }, - "intent_template_id": 151 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 119, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 4 stars or above for {{product}}.", - "original.intent_template": "Tell me the reasons why customers like {{product}}", - "instantiation_dict": { - "product": "Antonia Racer Tank" - }, - "intent": "List all reviews with 4 stars or above for Antonia Racer Tank.", - "original.intent": "Tell me the reasons why customers like Antonia Racer Tank", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "A regular or me", - "type": "text" - }, - "rating": { - "value": 4, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Its color and style is good" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Its color and style is good" - }, "intent_template_id": 250, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 4 stars or above for Antonia Racer Tank.", + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + 
"instantiation_dict": {"product": "Antonia Racer Tank"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "A regular or me", "rating": 4} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 120, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 4 stars or above for {{product}}.", - "original.intent_template": "Tell me the reasons why customers like {{product}}", - "instantiation_dict": { - "product": "Ana Running Short" - }, - "intent": "List all reviews with 4 stars or above for Ana Running Short.", - "original.intent": "Tell me the reasons why customers like Ana Running Short", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "It was really hard to find the right siz", - "type": "text" - }, - "rating": { - "value": 4, - "type": "numeric" - } - }, - { - "title": { - "value": "VERY LIGHTWEIGHT COMFY-GOOD SHOES", - "type": "text" - }, - "rating": { - "value": 5, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - 
"reference_answers": { - "fuzzy_match": [ - "It is comfortable" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "It is comfortable" - }, "intent_template_id": 250, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 4 stars or above for Ana Running Short.", + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "instantiation_dict": {"product": "Ana Running Short"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "title": "It was really hard to find the right siz", "rating": 4 }, + { "title": "VERY LIGHTWEIGHT COMFY-GOOD SHOES" , "rating": 5 } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 121, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 4 stars or above for {{product}}.", - "original.intent_template": "Tell me the reasons why customers like {{product}}", - "instantiation_dict": { - "product": "Circe hooded fleece" - }, - "intent": "List all reviews with 4 stars or above for Circe hooded fleece.", - "original.intent": "Tell me the reasons why customers like Circe hooded fleece", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": 
"verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Good but not perfect", - "type": "text" - }, - "rating": { - "value": 4, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Warm and comfortable. True to size." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Warm and comfortable. True to size." - }, "intent_template_id": 250, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 4 stars or above for Circe hooded fleece.", + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "instantiation_dict": {"product": "Circe hooded fleece"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 122, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 4 stars or above for {{product}}.", - "original.intent_template": "Tell me the reasons why customers like {{product}}", - 
"instantiation_dict": { - "product": "Olivia zip jacket" - }, - "intent": "List all reviews with 4 stars or above for Olivia zip jacket.", - "original.intent": "Tell me the reasons why customers like Olivia zip jacket", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Quite good", - "type": "text" - }, - "rating": { - "value": 5, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Lightweight, comfortable and stylish. Good design and details." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lightweight, comfortable, and stylish. Good design and details." - }, "intent_template_id": 250, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 4 stars or above for Olivia zip jacket.", + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "instantiation_dict": {"product": "Olivia zip jacket"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "Quite good", "rating": 5} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": 
["shopping_admin"], "task_id": 123, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 4 stars or above for {{product}}.", - "original.intent_template": "Tell me the reasons why customers like {{product}}", - "instantiation_dict": { - "product": "Circe's products" - }, - "intent": "List all reviews with 4 stars or above for Circe's products.", - "original.intent": "Tell me the reasons why customers like Circe's products", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Good but not perfect", - "type": "text" - }, - "rating": { - "value": 4, - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Warm and comfortable. True to size." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Warm and comfortable. True to size." 
- }, "intent_template_id": 250, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 4 stars or above for Circe's products.", + "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "instantiation_dict": {"product": "Circe's products"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 124, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": { - "product": "wireless earphone" - }, - "intent": "What is the price range of wireless earphone in the One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 0.01, - "type": "currency" - }, - "max": { - "value": 745.0, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0.14", - "745.00" - ] - }, - "reference_url": "", - "program_html": [], - 
"string_note": "", - "reference_answer_raw_annotation": "$0.14 - $745.00" - }, "intent_template_id": 159, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range of wireless earphone in the One Stop Market?", + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": {"product": "wireless earphone"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Min cost for earphone is a earbud for 0.01" - }, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 0.01, "max": 745.0} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 125, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": { - "product": "teeth grinding mouth guard" - }, - "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 1.46, - "type": "currency" - }, - "max": { - "value": 85.0, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - 
"original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1.46", - "85" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$1.46 - $85" - }, "intent_template_id": 159, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": {"product": "teeth grinding mouth guard"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 1.46, "max": 179.99} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 126, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": { - "product": "Canon photo printer" - }, - "intent": "What is the price range of Canon photo printer in the One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 2.56, - "type": "currency" - }, - "max": { - "value": 649.99, - "type": "currency" - } - } - ] - } 
- } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2.56", - "649.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$2.56 - $649.99" - }, "intent_template_id": 159, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range of Canon photo printer in the One Stop Market?", + "intent_template": "What is the price range of {{product}} in the One Stop Market?", + "instantiation_dict": {"product": "Canon photo printer"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 2.56, "max": 649.99} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 127, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 1001, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What brands appear most frequently among the top search terms?", "intent_template": "What brands appear most frequently among the top search terms?", "instantiation_dict": {}, - "intent": "What brands appear most frequently among the top search terms?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" 
- }, - "expected_data": [ - "Hollister", - "Joust", - "Antonia" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Hollister", - "Joust", - "Antonia" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Hollister, Joust, Antonia" - }, - "intent_template_id": 1001, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Hollister", "Joust", "Antonia"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 128, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", - "instantiation_dict": { - "k": "2" - }, - "intent": "What's the total number of items sold in the most recent 2 orders?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 9 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "9" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "9" - }, "intent_template_id": 1002, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What's the total number of items 
sold in the most recent 2 complete orders?", + "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", + "instantiation_dict": {"k": "2"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [3] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 129, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", - "instantiation_dict": { - "k": "4" - }, - "intent": "What's the total number of items sold in the most recent 4 orders?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 16 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "16" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "16" - }, "intent_template_id": 1002, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What's the total number of items sold in the most recent 4 complete orders?", + "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", + "instantiation_dict": {"k": "4"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": 
"permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [9] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 130, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", - "instantiation_dict": { - "k": "5" - }, - "intent": "What's the total number of items sold in the most recent 5 orders?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 18 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "18" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "18" - }, "intent_template_id": 1002, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What's the total number of items sold in the most recent 5 complete orders?", + "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", + "instantiation_dict": {"k": "5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": 
[12] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 131, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "What's the total number of items sold in the most recent {{k}} orders?", - "instantiation_dict": { - "k": "7" - }, - "intent": "What's the total number of items sold in the most recent 7 orders?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 25 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "25" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "25" - }, "intent_template_id": 1002, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "What's the total number of items sold in the most recent 7 complete orders?", + "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", + "instantiation_dict": {"k": "7"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [17] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 132, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "How many commits did {{user}} make 
to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "kilian", - "repo": "a11yproject.com", - "date": "3/5/2023" - }, - "original.instantiation_dict": { - "user": "kilian", - "repo": "a11yproject", - "date": "3/5/2023" - }, - "intent": "How many commits did kilian make to a11yproject.com on 3/5/2023?", - "original.intent": "How many commits did kilian make to a11yproject on 3/5/2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, "intent_template_id": 322, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Repo corrected; date format ambiguous; require numeric answer" - }, + "start_urls": ["__GITLAB__"], + "intent": "How many commits did kilian make to a11yproject.com on 3/5/2023?", + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "3/5/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + 
"sites": ["gitlab"], "task_id": 133, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "Eric", - "repo": "a11yproject.com", - "date": "3/2" - }, - "original.instantiation_dict": { - "user": "Eric", - "repo": "a11yproject", - "date": "3/2" - }, - "intent": "How many commits did Eric make to a11yproject.com on 3/2?", - "original.intent": "How many commits did Eric make to a11yproject on 3/2?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 2 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2" - }, "intent_template_id": 322, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Same as 132: repo corrected; ambiguous date; numeric answer" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "How many commits did Eric make to a11yproject.com on 3/2?", + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": {"user": "Eric", "repo": "a11yproject.com", "date": "3/2/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": 
"number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [2] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 134, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "kilian", - "repo": "a11yproject.com", - "date": "3/1/2023" - }, - "original.instantiation_dict": { - "user": "kilian", - "repo": "a11yproject", - "date": "3/1/2023" - }, - "intent": "How many commits did kilian make to a11yproject.com on 3/1/2023?", - "original.intent": "How many commits did kilian make to a11yproject on 3/1/2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 322, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Same as 132: repo corrected; ambiguous date; numeric answer" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "How many commits did kilian make to a11yproject.com on 3/1/2023?", + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "3/1/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - 
"key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 135, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "Eric and Kilian", - "repo": "a11yproject.com", - "date": "1/3/2023" - }, - "original.instantiation_dict": { - "user": "Eric and Kilian", - "repo": "a11yproject", - "date": "1/3/2023" - }, - "intent": "How many commits did Eric and Kilian make to a11yproject.com on 1/3/2023?", - "original.intent": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, "intent_template_id": 322, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Repo corrected; date format ambiguous; require numeric answer" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "How many commits did Eric and Kilian make to a11yproject.com on 
1/3/2023?", + "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", + "instantiation_dict": {"user": "Eric and Kilian", "repo": "a11yproject.com", "date": "1/3/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 136, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "Steven Woodson", - "repo": "a11y-webring.club", - "date": "2/6/2023" - }, - "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 5 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "5" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "5" - }, "intent_template_id": 322, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?", + "intent_template": "How many commits did {{user}} make to {{repo}} on 
{{date}}?", + "instantiation_dict": {"user": "Steven Woodson", "repo": "a11y-webring.club", "date": "2/6/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [5] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 137, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 51, + "start_urls": ["__MAP__"], + "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?", "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", "instantiation_dict": { "city1": "the city where the Liberty Bell is located", "city2": "the home city of Pirates" }, - "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "5h 47min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "5h 47min" - }, - "intent_template_id": 51, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["5h 47min"] 
+ } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 138, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 51, + "start_urls": ["__MAP__"], + "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?", "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", "instantiation_dict": { "city1": "the big apple", "city2": "the city with the most authentic Philly cheesesteaks" }, - "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "1h 58min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1h 58min" - }, - "intent_template_id": 51, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["1h 58min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 139, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", - "instantiation_dict": { - "city1": "the hometown of Joe Biden", - "city2": "Bridgeport" - }, - "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", - "require_reset": false, - 
"eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "3h 20min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3h 20min" - }, "intent_template_id": 51, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": {"city1": "the hometown of Joe Biden", "city2": "Bridgeport"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["3h 20min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 140, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", - "instantiation_dict": { - "city1": "the city of Niagara Falls", - "city2": "the city of Yale University" - }, - "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "8h 33min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "8h 33min" - }, "intent_template_id": 51, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the estimated driving time between the city 
of Niagara Falls and the city of Yale University?", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "instantiation_dict": {"city1": "the city of Niagara Falls", "city2": "the city of Yale University"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["8h 33min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 141, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", - "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", - "instantiation_dict": { - "category": "food-related", - "time": "March 2023" - }, - "intent": "Return how much I spent on food-related shopping during March 2023", - "original.intent": "How much I spent on food-related shopping during March 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "47.41" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "47.41" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$47.41" - }, "intent_template_id": 162, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return 
value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on food-related shopping during March 2023 without considering shipping and handling fee", + "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", + "instantiation_dict": {"category": "food-related", "time": "March 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["32.41"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 142, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", - "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", - "instantiation_dict": { - "category": "hair care and hair style", - "time": "Jan 2023" - }, - "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023", - "original.intent": "How much I spent on hair care and hair style shopping during Jan 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "68.51" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "95.23" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - 
"reference_answer_raw_annotation": "$95.23" - }, "intent_template_id": 162, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Update expected value to match two products in jan orders (50.52 + 17.99)" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023 without considering shipping and handling fee", + "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", + "instantiation_dict": {"category": "hair care and hair style", "time": "Jan 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["68.51"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 143, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", - "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", - "instantiation_dict": { - "category": "home decoration", - "time": "1/29/2023" - }, - "intent": "Return how much I spent on home decoration shopping during 1/29/2023", - "original.intent": "How much I spent on home decoration shopping during 1/29/2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": 
"verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "260.69" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "265.69" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$265.69" - }, "intent_template_id": 162, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on home decoration shopping during 1/29/2023 without considering shipping and handling fee", + "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", + "instantiation_dict": {"category": "home decoration", "time": "1/29/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Update expected value to tree purchase 260.69" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["260.69"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 144, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent on {{category}} shopping during {{time}}", - "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", - "instantiation_dict": { - "category": 
"food", - "time": "from mid Jan to the end Jan 2023" - }, - "intent": "Return how much I spent on food shopping during from mid Jan to the end Jan 2023", - "original.intent": "How much I spent on food shopping during from mid Jan to the end Jan 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 162, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on food shopping from mid Jan to the end of Jan 2023 without considering shipping and handling fee", + "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", + "instantiation_dict": {"category": "food", "time": "from mid Jan to the end Jan 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 145, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": 
"Return how much I spent on {{category}} shopping during {{time}}", - "original.intent_template": "How much I spent on {{category}} shopping during {{time}}", - "instantiation_dict": { - "category": "cooking and food", - "time": "March 2022" - }, - "intent": "Return how much I spent on cooking and food shopping during March 2022", - "original.intent": "How much I spent on cooking and food shopping during March 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "52.35" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "52.35" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$52.35" - }, "intent_template_id": 162, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on cooking and food shopping during March 2022 without considering shipping and handling fee", + "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", + "instantiation_dict": {"category": "cooking and food", "time": "March 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["42.35"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": 
["shopping"], "task_id": 146, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", - "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", - "instantiation_dict": { - "option": "size", - "product": "picture frame", - "time": "Sep 2022" - }, - "intent": "What is the size of the picture frame I bought Sep 2022", - "original.intent": "What is the size configuration of the picture frame I bought Sep 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "16\"x24\"" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "16x24" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "16x24" - }, "intent_template_id": 155, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent by removing \"configuration\"" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Update to correct size" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the size of the picture frame I bought Sep 2022", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", + "instantiation_dict": {"option": "size", "product": "picture frame", "time": "Sep 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, 
+ "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["16\"x24\""] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 147, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", - "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", + "intent_template_id": 155, + "start_urls": ["__SHOPPING__"], + "intent": "Get the size of the picture frame I bought between June and December 2022", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", "instantiation_dict": { "option": "size", "product": "picture frame", - "time": "2022" - }, - "intent": "What is the size of the picture frame I bought between June and December 2022", - "original.intent": "What is the size configuration of the picture frame I bought 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "16\"x24\"" - ] - } - } - ], - "site": "shopping" + "time": "between June and December 2022" }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "16x24" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "16x24" - }, - "intent_template_id": 155, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent by removing \"configuration\"" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Update to correct size" - }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - 
"category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["16\"x24\""] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 148, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", - "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", - "instantiation_dict": { - "option": "color", - "product": "picture frame", - "time": "Sep 2022" - }, - "intent": "What is the color of the picture frame I bought Sep 2022", - "original.intent": "What is the color configuration of the picture frame I bought Sep 2022", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Mist" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Mist" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Mist" - }, "intent_template_id": 155, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent by removing \"configuration\"" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the color of the picture frame I bought Sep 2022", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", + "instantiation_dict": {"option": "color", "product": "picture frame", "time": "Sep 2022"}, + "format_specification": 
null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Mist"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 149, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", - "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", - "instantiation_dict": { - "option": "color", - "product": "artifical plants", - "time": "Feb 2023" - }, - "intent": "What is the color of the artifical plants I bought Feb 2023", - "original.intent": "What is the color configuration of the artifical plants I bought Feb 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Green" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Green-vines" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Green-vines" - }, "intent_template_id": 155, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent by removing \"configuration\"" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Update to the exact color" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the color of the artifical plants I bought 
Feb 2023", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", + "instantiation_dict": {"option": "color", "product": "artifical plants", "time": "Feb 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Green-vines"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 150, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the {{option}} of the {{product}} I bought {{time}}", - "original.intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}", - "instantiation_dict": { - "option": "price", - "product": "fake tree", - "time": "Jan 2023" - }, - "intent": "What is the price of the fake tree I bought Jan 2023", - "original.intent": "What is the price configuration of the fake tree I bought Jan 2023", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "260.69" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "260.69" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "260.69" - }, "intent_template_id": 155, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent by removing \"configuration\"" - }, + "start_urls": 
["__SHOPPING__"], + "intent": "Get the price of the fake tree I bought Jan 2023", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", + "instantiation_dict": {"option": "price", "product": "fake tree", "time": "Jan 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["260.69"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 151, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": { - "location1": "CMU", - "location2": "University of Pittsburgh" - }, - "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "4min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "4min" - }, "intent_template_id": 36, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh?", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": {"location1": "CMU", "location2": "University of Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": 
"AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["4min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 152, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": { - "location1": "Schenley park", - "location2": "Upitt" - }, - "intent": "What is the minimum travel time by car from Schenley park to Upitt?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "4min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "4min" - }, "intent_template_id": 36, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the minimum travel time by car from Schenley park to Upitt?", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": {"location1": "Schenley park", "location2": "Upitt"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["4min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 153, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the minimum travel time by car from 
{{location1}} to {{location2}}?", - "instantiation_dict": { - "location1": "REI", - "location2": "CMU" - }, - "intent": "What is the minimum travel time by car from REI to CMU?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "7min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "7min" - }, "intent_template_id": 36, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the minimum travel time by car from REI to CMU?", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": {"location1": "REI", "location2": "CMU"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["7min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 154, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": { - "location1": "CMU gates building", - "location2": "Schenley park" - }, - "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "4min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "4min" - }, 
"intent_template_id": 36, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "instantiation_dict": {"location1": "CMU gates building", "location2": "Schenley park"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["4min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 155, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 36, + "start_urls": ["__MAP__"], + "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", "instantiation_dict": { "location1": "Animal Rescue League of Pittsburgh", "location2": "Schenley park" }, - "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "9min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "9min" - }, - "intent_template_id": 36, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": 
"AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["9min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 156, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 290, + "start_urls": ["__GITLAB__"], + "intent": "Navigate to the merge requests assigned to me", "intent_template": "Navigate to the merge requests assigned to me", - "original.intent_template": "Checkout merge requests assigned to me", "instantiation_dict": {}, - "intent": "Navigate to the merge requests assigned to me", - "original.intent": "Checkout merge requests assigned to me", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 290, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": 
[ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 157, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to all customers", - "original.intent_template": "Show all customers", - "instantiation_dict": {}, - "intent": "Navigate to all customers", - "original.intent": "Show all customers", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/customer/index/" - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/customer/index/", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 255, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Show all customers", + "intent_template": "Show all customers", + "instantiation_dict": {}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the customers page instead of returning a list of customers" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/customer/index/", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 158, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} 
cards", - "instantiation_dict": { - "num": 11 - }, + "intent_template_id": 171, + "start_urls": ["__SHOPPING__"], "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 11 cards", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 171 + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": {"num": 11}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 159, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", 
- "geolocation": null, - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", - "instantiation_dict": { - "num": 31 - }, + "intent_template_id": 171, + "start_urls": ["__SHOPPING__"], "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 31 cards", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 171 + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": {"num": 31}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 160, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find 
the best storage option to fit all {{num}} cards", - "instantiation_dict": { - "num": 6 - }, + "intent_template_id": 171, + "start_urls": ["__SHOPPING__"], "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 6 cards", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 171 + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": {"num": 6}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 161, - "require_login": true, - "storage_state": 
"./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", - "instantiation_dict": { - "num": 23 - }, + "intent_template_id": 171, + "start_urls": ["__SHOPPING__"], "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 23 cards", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 171 + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": {"num": 23}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", + "response_status": 200, + "event_type": "navigation" 
+ } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 162, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", - "instantiation_dict": { - "num": 40 - }, + "intent_template_id": 171, + "start_urls": ["__SHOPPING__"], "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 171 + "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "instantiation_dict": {"num": 40}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 163, - 
"require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html", - "geolocation": null, + "intent_template_id": 136, + "start_urls": [ + "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html" + ], + "intent": "List all review titles with 2 stars or below for this product.", "intent_template": "List all review titles with 2 stars or below for this product.", - "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", "instantiation_dict": {}, - "intent": "List all review titles with 2 stars or below for this product.", - "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Memory Card Came Defective" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "I ordered the 16gb but I only got 14 gigs even though I formatted the card", - "The memory card is kind of slow on games and downloads", - "No original packaging It's used and the previous owners data has not been erased", - "The product is a legit sony hardware that have been owned by someone else before", - "The media could not be loaded", - "I could not format the card so I wasn\u2019t able to use it for my VITA" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. 
No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA" - }, - "intent_template_id": 136, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Memory Card Came Defective"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 164, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html", - "geolocation": null, + "intent_template_id": 136, + "start_urls": ["__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html"], + "intent": "List all review titles with 2 stars or below for this product.", "intent_template": "List all review titles with 2 stars or below for this product.", - "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", "instantiation_dict": {}, - "intent": "List all review titles with 2 stars or below for this product.", - "original.intent": "What are the main criticisms of this product? 
Please extract the relevant sentences.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Meh" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Dry", - "Uneven color" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "DryUneven color" - }, - "intent_template_id": 136, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Meh"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 165, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html", - "geolocation": null, + "intent_template_id": 136, + "start_urls": ["__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html"], + "intent": "List all review titles with 2 stars or below for this product.", "intent_template": "List all review titles with 2 stars or below for this product.", - "original.intent_template": "What are the main criticisms of this product? 
Please extract the relevant sentences.", "instantiation_dict": {}, - "intent": "List all review titles with 2 stars or below for this product.", - "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "So cute but too small", - "Toe rubbed" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "The 39 was too small. I am afraid the 40 will be too big", - "I was very sad when the shoe rubbed up against my baby toe", - "I had to return them because I knew in time it would tear up my feet", - "The problem is that the strap is made of some really stiff leather and is painful to my heel", - "The front is also uncomfortably tight", - "The Dansko's were similar (not as bad) and loosened up over time" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "The 39 was too small. I am afraid the 40 will be too big. I was very sad when the shoe rubbed up against my baby toe. I had to return them because I knew in time it would tear up my feet. The problem is that the strap is made of some really stiff leather and is painful to my heel. The front is also uncomfortably tight. The Dansko's were similar (not as bad) and loosened up over time." - }, - "intent_template_id": 136, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Original task required subjective interpretation of review sentiment. 
Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["So cute but too small", "Toe rubbed"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 166, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html", - "geolocation": null, + "intent_template_id": 136, + "start_urls": [ + "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html" + ], + "intent": "List all review titles with 2 stars or below for this product.", "intent_template": "List all review titles with 2 stars or below for this product.", - "original.intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.", "instantiation_dict": {}, - "intent": "List all review titles with 2 stars or below for this product.", - "original.intent": "What are the main criticisms of this product? 
Please extract the relevant sentences.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "there is no existing criticism", - "reference_answer_raw_annotation": "N/A" - }, - "intent_template_id": 136, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." - }, - { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 167, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html", - "geolocation": null, + "intent_template_id": 136, + "start_urls": ["__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html"], + "intent": "List all review titles with 2 stars or below for this product.", "intent_template": "List all review titles with 2 stars or below for this product.", - "original.intent_template": "What are the main criticisms of this product? 
Please extract the relevant sentences.", "instantiation_dict": {}, - "intent": "List all review titles with 2 stars or below for this product.", - "original.intent": "What are the main criticisms of this product? Please extract the relevant sentences.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Waste of big money" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "The wireless connection works on a whim (about 40% of the time I've owned it)", - "It seems to constantly run out of ink", - "Cartridge prices are less than some printers I've had", - "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason", - "Scanner is as slow as my first scanner I ever owned in the mid-90's", - "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had, but now I understand why. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. 
For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months." - }, - "intent_template_id": 136, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Original task required subjective interpretation of review sentiment. Reframed to objective extraction of structured review data with clear filtering criteria (\u22642 stars) to enable exact matching instead of fuzzy semantic matching." + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Waste of big money"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 168, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", - "instantiation_dict": { - "description": "more than 100" - }, - "intent": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "No repo found", - "reference_answer_raw_annotation": "No 
repo found" - }, "intent_template_id": 289, - "format_specification": "A list of the repository names (do not include the group/organization name)", - "changelogs": [ - { - "key": "format_specification", - "category": "clarify_instructions", - "note": "Clarify the format of the response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got more than 100 stars?", + "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", + "instantiation_dict": {"description": "more than 100"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Expect the right error code for unachievable tasks" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 169, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", - "instantiation_dict": { - "description": "the most" - }, - "intent": "Tell me the full names of the repositories where I made contributions and they got the most stars?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "a11yproject.com", - "design" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "a11yproject.com", - "design" 
- ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "a11yproject.com, Primer/design" - }, "intent_template_id": 289, - "format_specification": "A list of the repository names (do not include the group/organization name)", - "changelogs": [ - { - "key": "format_specification", - "category": "clarify_instructions", - "note": "Clarify the format of the response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got the most stars?", + "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", + "instantiation_dict": {"description": "the most"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [174, 180] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 170, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", - "instantiation_dict": { - "description": "the least" - }, - "intent": "Tell me the full names of the repositories where I made contributions and they got the least stars?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - 
"ordered": "false", - "type": "text" - }, - "expected_data": [ - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - } + "intent_template_id": 289, + "start_urls": ["__GITLAB__"], + "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got the least stars?", + "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", + "instantiation_dict": {"description": "the least"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [189, 193, 190, 188, 184, 181] } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" - }, - "intent_template_id": 289, - "format_specification": "A list of the repository names (do not include the group/organization name)", - "changelogs": [ - { - "key": "format_specification", - "category": "clarify_instructions", - "note": "Clarify the format of the response" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] 
+ ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 171, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", - "instantiation_dict": { - "description": "less than 5" - }, - "intent": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "a11y-syntax-highlighting", - "a11y-webring.club", - "accessible-html-content-patterns", - "ericwbailey.website", - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "a11y-syntax-highlighting", - "a11y-webring.club", - "accessible-html-content-patterns", - "ericwbailey.website", - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" - }, "intent_template_id": 289, - "format_specification": "A list of the repository names (do not include the group/organization name)", - "changelogs": [ - { - "key": "format_specification", - "category": 
"clarify_instructions", - "note": "Clarify the format of the response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got less than 5 stars?", + "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", + "instantiation_dict": {"description": "less than 5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [186, 179, 185, 182, 189, 193, 190, 188, 184, 181] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 172, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?", - "instantiation_dict": { - "description": "no" - }, - "intent": "Tell me the full names of the repositories where I made contributions and they got no stars?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" 
- ], - "reference_answers": { - "must_include": [ - "cloud-to-butt", - "dotfiles", - "timeit", - "solarized-prism-theme", - "gimmiethat.space", - "remove-board-movement-events-from-the-github-issue-timeline" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline" - }, "intent_template_id": 289, - "format_specification": "A list of the repository names (do not include the group/organization name)", - "changelogs": [ - { - "key": "format_specification", - "category": "clarify_instructions", - "note": "Clarify the format of the response" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got no stars?", + "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", + "instantiation_dict": {"description": "no"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [189, 193, 190, 188, 184, 181] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 173, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - 
"original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "instantiation_dict": { - "keyword": "better" - }, - "intent": "Navigate to my latest updated issue that has keyword \"better\" in its title to check if it is closed", - "original.intent": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Not closed" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "fuzzy_match": [ - "No, it is open" - ] - }, - "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "", - "url_note": "GOLD in PRED" - }, "intent_template_id": 310, - "format_specification": "Respond with \"Not closed\" or \"Closed\"", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest updated issue with \"better\" in its title is closed", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", + "instantiation_dict": {"keyword": "better"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + 
"ordered": "false", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 174, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "instantiation_dict": { - "keyword": "feature" - }, - "intent": "Navigate to my latest updated issue that has keyword \"feature\" in its title to check if it is closed", - "original.intent": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Not closed" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "fuzzy_match": [ - "No, it is open" - ] - }, - "reference_url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "" - }, "intent_template_id": 310, - "format_specification": "Respond with \"Not closed\" or \"Closed\"", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + 
"intent": "Get whether my latest updated issue with \"feature\" in its title is closed", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", + "instantiation_dict": {"keyword": "feature"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 175, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "instantiation_dict": { - "keyword": "dependency" - }, - "intent": "Navigate to my latest updated issue that has keyword \"dependency\" in its title to check if it is closed", - "original.intent": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Not closed" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "fuzzy_match": [ - "No, it is open" - ] - }, - "reference_url": 
"__GITLAB__/byteblaze/empathy-prompts/-/issues/18", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "" - }, "intent_template_id": 310, - "format_specification": "Respond with \"Not closed\" or \"Closed\"", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest updated issue with \"dependency\" in its title is closed", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", + "instantiation_dict": {"keyword": "dependency"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 176, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "instantiation_dict": { - "keyword": "theme editor" - }, - "intent": "Navigate to my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed", - 
"original.intent": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Not closed" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "fuzzy_match": [ - "No, it is open" - ] - }, - "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "" - }, "intent_template_id": 310, - "format_specification": "Respond with \"Not closed\" or \"Closed\"", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest updated issue with \"theme editor\" in its title is closed", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", + "instantiation_dict": {"keyword": "theme editor"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 177, - 
"require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "original.intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed", - "instantiation_dict": { - "keyword": "homepage content" - }, - "intent": "Navigate to my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed", - "original.intent": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Closed" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Yes, it is closed" - ] - }, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", - "program_html": [], - "reference_answer_raw_annotation": "closed", - "string_note": "" - }, "intent_template_id": 310, - "format_specification": "Respond with \"Not closed\" or \"Closed\"", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest updated issue with \"homepage content\" in its title is closed", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", + "instantiation_dict": {"keyword": "homepage content"}, + 
"format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 178, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", - "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", - "instantiation_dict": { - "keyword": "better" - }, - "intent": "Navigate to and open my latest created issue that has better in its title to check if it is closed", - "original.intent": "Open my latest created issue that has better in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "exact_match": "Yes" - }, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/566", - "program_html": [], - "reference_answer_raw_annotation": "Closed", - "string_note": "" - }, "intent_template_id": 500, - "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", - "changelogs": [ - { - "key": "intent_template", - "category": 
"clarify_instructions", - "note": "Clarify that this task requires navigation and a response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest created issue with better in its title is closed", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", + "instantiation_dict": {"keyword": "better"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 179, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", - "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", - "instantiation_dict": { - "keyword": "feature" - }, - "intent": "Navigate to and open my latest created issue that has feature in its title to check if it is closed", - "original.intent": "Open my latest created issue that has feature in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "exact_match": "Yes" - 
}, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1517", - "program_html": [], - "reference_answer_raw_annotation": "Closed", - "string_note": "" - }, "intent_template_id": 500, - "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest created issue with feature in its title is closed", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", + "instantiation_dict": {"keyword": "feature"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify that this task requires navigation and a response" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 180, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", - "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", - "instantiation_dict": { - "keyword": "dependency" - }, - "intent": "Navigate to and open my latest created issue that has dependency in its title to check if it is closed", - "original.intent": "Open my latest created issue that has dependency in its title to check if it is 
closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "No" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "exact_match": "No" - }, - "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "" - }, "intent_template_id": 500, - "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify that this task requires navigation and a response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest created issue with dependency in its title is closed", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", + "instantiation_dict": {"keyword": "dependency"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 181, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", - 
"original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", - "instantiation_dict": { - "keyword": "theme editor" - }, - "intent": "Navigate to and open my latest created issue that has theme editor in its title to check if it is closed", - "original.intent": "Open my latest created issue that has theme editor in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "No" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "exact_match": "No" - }, - "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1", - "program_html": [], - "reference_answer_raw_annotation": "Not closed", - "string_note": "" - }, "intent_template_id": 500, - "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify that this task requires navigation and a response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest created issue with theme editor in its title is closed", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", + "instantiation_dict": {"keyword": "theme editor"}, + "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": 
"SUCCESS", + "retrieved_data": [false] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 182, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Navigate to and open my latest created issue that has {{keyword}} in its title to check if it is closed", - "original.intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed", - "instantiation_dict": { - "keyword": "homepage content" - }, - "intent": "Navigate to and open my latest created issue that has homepage content in its title to check if it is closed", - "original.intent": "Open my latest created issue that has homepage content in its title to check if it is closed", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Yes" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match", - "url_match" - ], - "reference_answers": { - "exact_match": "Yes" - }, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719", - "program_html": [], - "reference_answer_raw_annotation": "closed", - "string_note": "" - }, "intent_template_id": 500, - "format_specification": "Return \"Yes\" if the issue is closed and \"No\" if the issue is opened", - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify that this task requires navigation and a response" - }, + "start_urls": ["__GITLAB__"], + "intent": "Get whether my latest created issue with homepage content in its title is closed", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", + "instantiation_dict": {"keyword": "homepage content"}, + "format_specification": "Use a boolean where true means 
the issue is closed and false means the issue is opened.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "boolean"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [true] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 183, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": { - "Attribute": "SKU", - "N": "10" - }, - "intent": "Give me the SKU of the products that have 10 units left", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no product that has 10 quantities left.", - "reference_answer_raw_annotation": "There is no product that has 10 quantities left." 
- }, "intent_template_id": 368, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Give me the SKU of the simple products that have 10 units left", + "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "instantiation_dict": {"Attribute": "SKU", "N": "10"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 184, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": { - "Attribute": "name", - "N": "0" - }, - "intent": "Give me the name of the products that have 0 units left", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Mona Pullover Hoodlie", - "Nadia Elements Shell", - "Neve Studio Dance Jacket", - "Juno Jacket", - "Olivia 1/4 Zip Light Jacket", - "Gabrielle Micro Sleeve Top", - "Iris Workout Top", - "Layla Tee", - "Elisa EverCool\u2122 Tee", - "Juliana Short-Sleeve Tee", - "Minerva LumaTech\u2122 V-Tee", - "Tiffany Fitness Tee", - "Karissa V-Neck Tee", - "Diva Gym Tee", - "Radiant Tee", - "Gwyn Endurance Tee", - "Desiree Fitness Tee", - "Jade Yoga Jacket", - "Adrienne Trek Jacket", - "Inez Full Zip Jacket", - "Hera Pullover Hoodie", - "Autumn Pullie", - "Miko Pullover Hoodie", - 
"Selene Yoga Hoodie", - "Daphne Full-Zip Hoodie", - "Phoebe Zipper Sweatshirt", - "Cassia Funnel Sweatshirt", - "Ariel Roll Sleeve Sweatshirt", - "Helena Hooded Fleece", - "Eos V-Neck Hoodie", - "Circe Hooded Ice Fleece", - "Stellar Solar Jacket", - "Josie Yoga Jacket", - "Augusta Pullover Jacket", - "Ingrid Running Jacket", - "Riona Full Zip Jacket", - "Electra Bra Top", - "Erica Evercool Sports Bra", - "Celeste Sports Bra", - "Carina Basic Capri", - "Daria Bikram Pant", - "Sylvia Capri", - "Deirdre Relaxed-Fit Capri", - "Portia Capri", - "Fiona Fitness Short", - "Maxima Drawstring Short", - "Gwen Drawstring Bike Short", - "Artemis Running Short", - "Bess Yoga Short", - "Angel Light Running Short", - "Echo Fit Compression Short", - "Sybil Running Short", - "Mimi All-Purpose Short", - "Ana Running Short", - "Ina Compression Short", - "Bardot Capri", - "Aeon Capri", - "Diana Tights", - "Prima Compete Bra Top", - "Lucia Cross-Fit Bra", - "Bella Tank", - "Zoe Tank", - "Nora Practice Tank", - "Nona Fitness Tank", - "Leah Yoga Top", - "Chloe Compete Tank", - "Maya Tunic", - "Antonia Racer Tank", - "Breathe-Easy Tank", - "Karmen Yoga Pant", - "Emma Leggings", - "Ida Workout Parachute Pant", - "Cora Parachute Pant", - "Sahara Leggings", - "Erika Running Short", - "Sprite Yoga Companion Kit", - "Taurus Elements Shell", - "Mars HeatTech\u2122 Pullover", - "Typhon Performance Fleece-lined Jacket", - "Jupiter All-Weather Trainer", - "Montana Wind Jacket", - "Proteus Fitness Jackshirt", - "Gobi HeatTec\u00ae Tee", - "Helios EverCool\u2122 Tee", - "Ryker LumaTech\u2122 Tee (Crew-neck)", - "Atomic Endurance Running Tee (V-neck)", - "Atomic Endurance Running Tee (Crew-Neck)", - "Balboa Persistence Tee", - "Zoltan Gym Tee", - "Aero Daily Fitness Tee", - "Ryker LumaTech\u2122 Tee (V-neck)", - "Logan HeatTec\u00ae Tee", - "Lando Gym Jacket", - "Orion Two-Tone Fitted Jacket", - "Kenobi Trail Jacket", - "Set of Sprite Yoga Straps", - "Chaz Kangeroo Hoodie", - "Teton Pullover Hoodie", 
- "Bruno Compete Hoodie", - "Frankie Sweatshirt", - "Hollister Backyard Sweatshirt", - "Stark Fundamental Hoodie", - "Hero Hoodie", - "Oslo Trek Hoodie", - "Abominable Hoodie", - "Mach Street Sweatshirt", - "Grayson Crewneck Sweatshirt", - "Ajax Full-Zip Sweatshirt", - "Marco Lightweight Active Hoodie", - "Beaumont Summit Kit", - "Hyperion Elements Jacket", - "Deion Long-Sleeve EverCool\u2122 Tee", - "Strike Endurance Tee", - "Erikssen CoolTech\u2122 Fitness Tank", - "Livingston All-Purpose Tight", - "Orestes Yoga Pant", - "Aether Gym Pant", - "Cronus Yoga Pant -33-Blue", - "Cronus Yoga Pant", - "Cobalt CoolTech\u2122 Fitness Short", - "Apollo Running Short", - "Meteor Workout Short", - "Torque Power Short", - "Hawkeye Yoga Short", - "Lono Yoga Short", - "Rapha Sports Short", - "Orestes Fitness Short", - "Troy Yoga Short", - "Sol Active Short", - "Arcadio Gym Short", - "Zeppelin Yoga Pant", - "Thorpe Track Pant", - "Mithra Warmup Pant", - "Tristan Endurance Tank", - "Primo Endurance Tank", - "Helios Endurance Tank", - "Rocco Gym Tank", - "Vulcan Weightlifting Tank", - "Argus All-Weather Tank", - "Sparta Gym Tank", - "Sinbad Fitness Tank", - "Tiberius Gym Tank", - "Atlas Fitness Tank", - "Cassius Sparring Tank", - "Caesar Warm-Up Pant", - "Viktor LumaTech\u2122 Pant", - "Geo Insulated Jogging Pant", - "Supernova Sport Pant", - "Kratos Gym Pant", - "Pierce Gym Short" - ] - } + "intent_template_id": 368, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Give me the name of the simple products that have 0 units left", + "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "instantiation_dict": {"Attribute": "name", "N": "0"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + 
"retrieved_data": ["Cronus Yoga Pant -33-Blue"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Sinbad Fitness Tank" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Sinbad Fitness Tank" - }, - "intent_template_id": 368 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 185, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": { - "Attribute": "brand", - "N": "3" - }, - "intent": "Give me the brand of the products that have 3 units left", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Eos V-Neck Hoodie-S-Blue", - "Minera Luma Tech V-Tee-XS-Blue" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Eos", - "Minerva" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Eos, Minerva" - }, "intent_template_id": 368, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Give me the brand of the simple products that have 3 units left", + "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "instantiation_dict": {"Attribute": "brand", "N": "3"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": 
false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Eos", "Minera"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 186, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": { - "Attribute": "product names and the sizes", - "N": "2-3" - }, - "intent": "Give me the product names and the sizes of the products that have 2-3 units left", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": "Eos V-Neck Hoodie-S-Blue", - "type": "text" - }, - "size": { - "value": "S", - "type": "text" - } - }, - { - "name": { - "value": "Minerva LumaTech\u2122 V-Tee-XS-Blue", - "type": "text" - }, - "size": { - "value": "XS", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Eos V-Neck Hoodie: S", - "Minera Luma Tech V-Tee: XS" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Eos V-Neck Hoodie: S Minera Luma Tech V-Tee: XS" - }, "intent_template_id": 368, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Give me the product names and the sizes of the simple products that have 2-3 units left", + "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "instantiation_dict": {"Attribute": "product names and the sizes", "N": "2-3"}, "format_specification": "Use \"name\" for 
the product name and \"size\" for the size.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "size": {"type": "string"} }, + "required": ["name", "size"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "name": "Eos V-Neck Hoodie-S-Blue" , "size": "S" }, + { "name": "Minerva LumaTech\u2122 V-Tee-XS-Blue", "size": "XS" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 187, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": { - "Attribute": "SKU", - "N": "1-3" - }, - "intent": "Give me the SKU of the products that have 1-3 units left", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "WH11-S-Blue", - "WS08-XS-Blue" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "WH11-S-Blue", - "WS08-XS-Blue" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "WH11-S-Blue, WS08-XS-Blue" - }, "intent_template_id": 368, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Give me the SKU of the simple products that have 1-3 units left", + "intent_template": "Give me the {{Attribute}} of the simple products that 
have {{N}} units left", + "instantiation_dict": {"Attribute": "SKU", "N": "1-3"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["WH11-S-Blue", "WS08-XS-Blue"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 188, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the total cost of my latest {{status}} order", - "original.intent_template": "Tell me the total cost of my latest {{status}} order?", - "instantiation_dict": { - "status": "cancelled" - }, - "intent": "Get the total cost of my latest cancelled order", - "original.intent": "Tell me the total cost of my latest cancelled order?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "365.42" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "365.42" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "365.42" - }, "intent_template_id": 214, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the total cost of my latest cancelled order", + "intent_template": "Get the total cost of my latest {{status}} order", + 
"instantiation_dict": {"status": "cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["365.42"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 189, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 214, + "start_urls": ["__SHOPPING__"], + "intent": "Get the total cost of my latest pending order", "intent_template": "Get the total cost of my latest {{status}} order", - "original.intent_template": "Tell me the total cost of my latest {{status}} order?", - "instantiation_dict": { - "status": "pending" - }, - "intent": "Get the total cost of my latest pending order", - "original.intent": "Tell me the total cost of my latest pending order?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "754.99" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "754.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "754.99" - }, - "intent_template_id": 214, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "instantiation_dict": {"status": "pending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - 
"key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["754.99"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 190, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the total cost of my latest {{status}} order", - "original.intent_template": "Tell me the total cost of my latest {{status}} order?", - "instantiation_dict": { - "status": "complete" - }, - "intent": "Get the total cost of my latest complete order", - "original.intent": "Tell me the total cost of my latest complete order?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "65.32" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "65.32" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "65.32" - }, "intent_template_id": 214, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the total cost of my latest complete order", + "intent_template": "Get the total cost of my latest {{status}} order", + "instantiation_dict": {"status": "complete"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to 
exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["65.32"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 191, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the total cost of my latest {{status}} order", - "original.intent_template": "Tell me the total cost of my latest {{status}} order?", - "instantiation_dict": { - "status": "processing" - }, - "intent": "Get the total cost of my latest processing order", - "original.intent": "Tell me the total cost of my latest processing order?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no order of \"processing\" status", - "reference_answer_raw_annotation": "There is no order of \"processing\" status" - }, "intent_template_id": 214, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the total cost of my latest processing order", + "intent_template": "Get the total cost of my latest {{status}} order", + "instantiation_dict": {"status": "processing"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + 
"evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 192, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the total cost of my latest {{status}} order", - "original.intent_template": "Tell me the total cost of my latest {{status}} order?", - "instantiation_dict": { - "status": "non-cancelled" - }, - "intent": "Get the total cost of my latest non-cancelled order", - "original.intent": "Tell me the total cost of my latest non-cancelled order?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "754.99" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "754.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "754.99" - }, "intent_template_id": 214, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value instead of navigation" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the total cost of my latest non-cancelled order", + "intent_template": "Get the total cost of my latest {{status}} order", + "instantiation_dict": {"status": "non-cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", 
"items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["754.99"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 193, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": { - "status": "completed", - "N": "2" - }, - "intent": "Get the total payment amount of the last 2 completed orders", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "182.4" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "182.4" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "182.4" - }, "intent_template_id": 367, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the total payment amount of the last 2 completed orders", + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": {"status": "completed", "N": "2"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["182.4"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 194, - 
"require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": { - "status": "completed", - "N": "5" - }, - "intent": "Get the total payment amount of the last 5 completed orders", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "555.2" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "555.2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "555.2" - }, "intent_template_id": 367, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the total payment amount of the last 5 completed orders", + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": {"status": "completed", "N": "5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["555.2"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 195, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": { - "status": 
"pending", - "N": "5" - }, - "intent": "Get the total payment amount of the last 5 pending orders", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "885.4" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "885.4" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "885.4" - }, "intent_template_id": 367, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the total payment amount of the last 5 pending orders", + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": {"status": "pending", "N": "5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["885.4"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 196, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Compare the payment difference of the last {{N}} {{status_1}} orders and {{status_2}} orders", - "instantiation_dict": { - "status_1": "cancelled", - "status_2": "completed", - "N": "4" - }, - "intent": "Return the payment difference between the last 4 cancelled orders and the last 4 completed orders", - "original.intent": "Compare the payment difference of the last 4 cancelled 
orders and completed orders", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "194.25" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "194.25" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "194.25" - }, "intent_template_id": 367, - "changelogs": [ - { - "key": "intent", - "category": "task_ambiguity", - "note": "Compare vs return the payment difference" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders", + "intent_template": "Get the payment difference between the last {{N}} {{status_1}} orders and the last {{N}} {{status_2}} orders", + "instantiation_dict": {"status_1": "cancelled", "status_2": "completed", "N": "4"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["194.25"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 197, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": { - "status": "non-cancelled", - "N": "5" - }, - "intent": "Get the total payment amount of the last 5 non-cancelled orders", - 
"require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "778.2" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "778.2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "annotation_note": "219.4+210+166.4+93.4+89", - "reference_answer_raw_annotation": "778.2" - }, "intent_template_id": 367, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the total payment amount of the last 5 non-cancelled orders", + "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", + "instantiation_dict": {"status": "non-cancelled", "N": "5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["778.2"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 198, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "customer name", - "status": "most recent cancelled" - }, + "intent_template_id": 366, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the customer name of the most recent cancelled order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - 
"eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Lily Potter" - ] - } + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "customer name", "status": "most recent cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Lily Potter"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Lily Potter" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lily Potter" - }, - "intent_template_id": 366 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 199, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "order ID", - "status": "newest pending" - }, - "intent": "Get the order ID of the newest pending order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 299 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "299" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "299" - }, "intent_template_id": 366, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the order ID of 
the newest pending order", + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "order ID", "status": "newest pending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [299] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 200, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "billing name", - "status": "oldest complete" - }, + "intent_template_id": 366, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the billing name of the oldest complete order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "John Lee" - ] - } + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "billing name", "status": "oldest complete"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["John Lee"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "John Lee" - 
}, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "John Lee" - }, - "intent_template_id": 366 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 201, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "customer name", - "status": "earliest fraud suspect" - }, - "intent": "Get the customer name of the earliest fraud suspect order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no order of \"fraud suspect\" status", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 366, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the customer name of the earliest fraud suspect order", + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "customer name", "status": "earliest fraud suspect"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 202, - "require_login": 
true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "date", - "status": "most recent cancelled" - }, - "original.instantiation_dict": { - "attribute": "date", - "status": "most recent canlled" - }, - "intent": "Get the date of the most recent cancelled order", - "original.intent": "Get the date of the most recent canlled order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "May 23 2023" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "May 23 2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "May 23, 2023" - }, "intent_template_id": 366, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Update cancelled spelling" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the date of the most recent cancelled order", + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "date", "status": "most recent cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["May 23 2023"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 203, - 
"require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": { - "attribute": "purchase date and order id", - "status": "most recent pending" - }, - "intent": "Get the purchase date and order id of the most recent pending order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "date": { - "value": "May 31, 2023", - "type": "date" - }, - "order_id": { - "value": "000000299", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "order id: 000000299", - "purchase date: May 31, 2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "000000299, May 31, 2023, 2:55:09 AM" - }, "intent_template_id": 366, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the purchase date and order id of the most recent pending order", + "intent_template": "Get the {{attribute}} of the {{status}} order", + "instantiation_dict": {"attribute": "purchase date and order id", "status": "most recent pending"}, "format_specification": "Use \"date\" for the date and \"order_id\" for the order id.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "date": {"type": "string", "format": "date"}, "order_id": {"type": "string"} }, + "required": ["date", "order_id"] + } + }, + "expected": { + 
"performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"date": "May 31, 2023", "order_id": "000000299"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 204, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 366, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the product name and discounted price (low to high) of the most recent completed order", "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": { "attribute": "product name and discounted price (low to high)", "status": "most recent completed" }, - "intent": "Get the product name and discounted price (low to high) of the most recent completed order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "true", - "type": "object" + "format_specification": "Use \"name\" for the product name and \"price\" for the discounted price.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": "true", + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name" : { "type": "string" }, + "price": { "type": "number", "format": "currency" } }, - "expected_data": [ - { - "name": { - "value": "Proteus Fitness Jackshirt", - "type": "text" - }, - "price": { - "value": "$45", - "type": "currency" - } - }, - { - "name": { - "value": "Ida Workout Parachute Pant", - "type": "text" - }, - "price": { - "value": "$48", - "type": "currency" - } - } - ] + "required": ["name", "price"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "name": "Proteus Fitness Jackshirt" , "price": 45 }, + { "name": "Ida Workout Parachute Pant", 
"price": 48 } + ] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Rapha Sports Short: $35", - "Thorpe Track Pant: $54.4", - "Mach Street Sweatshirt: $62" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Rapha Sports Short: $35 Thorpe Track Pant: $54.4 Mach Street Sweatshirt: $62" - }, - "intent_template_id": 366, - "format_specification": "Use \"name\" for the product name and \"price\" for the discounted price.", - "changelogs": [ - { - "key": "format_specification", - "category": "reference_alignment", - "note": "Original expected was for the first pending order not a completed order" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 205, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make on {{date}}?", - "instantiation_dict": { - "user": "kilian", - "date": "3/5/2023" - }, + "intent_template_id": 320, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], "intent": "How many commits did kilian make on 3/5/2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } + "intent_template": "How many commits did {{user}} make on {{date}}?", + "instantiation_dict": {"user": "kilian", "date": "3/5/2023"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { 
"type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, - "intent_template_id": 320, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 206, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make on {{date}}?", - "instantiation_dict": { - "user": "Eric", - "date": "3/2" - }, - "intent": "How many commits did Eric make on 3/2?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 2 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "2" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2" - }, "intent_template_id": 320, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Eric make on 3/2?", + "intent_template": "How many commits did {{user}} 
make on {{date}}?", + "instantiation_dict": {"user": "Eric", "date": "3/2"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [2] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 207, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make on {{date}} in total?", - "instantiation_dict": { - "user": "Eric and Kilian", - "date": "1/3/2023" - }, - "intent": "How many commits did Eric and Kilian make on 1/3/2023 in total?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, "intent_template_id": 320, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Eric and Kilian make on 1/3/2023 in total?", + "intent_template": "How many commits did {{user}} make on {{date}} in total?", + "instantiation_dict": {"user": "Eric and Kilian", "date": 
"1/3/2023"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 208, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": { - "PhoneNum": "+1 2058812302" - }, - "intent": "Find the customer name and email with phone number +1 2058812302", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": "John Smith", - "type": "text" - }, - "email": { - "value": "john.smith.xyz@gmail.com", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "John Smith", - "john.smith.xyz@gmail.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "John Smith, john.smith.xyz@gmail.com" - }, "intent_template_id": 364, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Find the customer name and email with phone number +1 2058812302", + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": {"PhoneNum": "+1 2058812302"}, "format_specification": "Use 
\"name\" for the customer name and \"email\" for the email.", - "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, + "required": ["email", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"name": "John Smith", "email": "john.smith.xyz@gmail.com"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 209, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": { - "PhoneNum": "2137418080" - }, - "intent": "Find the customer name and email with phone number 2137418080", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": "Jennifer White", - "type": "text" - }, - "email": { - "value": "jennifer.white@yahoo.com", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Jennifer White", - "jennifer.white@yahoo.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Jennifer White, jennifer.white@yahoo.com" - }, "intent_template_id": 364, + "start_urls": 
["__SHOPPING_ADMIN__"], + "intent": "Find the customer name and email with phone number 2137418080", + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": {"PhoneNum": "2137418080"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, + "required": ["email", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"name": "Jennifer White", "email": "jennifer.white@yahoo.com"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 210, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": { - "PhoneNum": "2065555555" - }, - "intent": "Find the customer name and email with phone number 2065555555", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": "Adam Garcia", - "type": "text" - }, - "email": { - "value": "gamingpro456@gmail.com", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - 
"must_include": [ - "Adam Garcia", - "gamingpro456@gmail.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Adam Garcia, gamingpro456@gmail.com" - }, "intent_template_id": 364, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Find the customer name and email with phone number 2065555555", + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": {"PhoneNum": "2065555555"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, + "required": ["email", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"name": "Adam Garcia", "email": "gamingpro456@gmail.com"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 211, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": { - "PhoneNum": "8015551212" - }, - "intent": "Find the customer name and email with phone number 8015551212", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": 
"Sean Miller", - "type": "text" - }, - "email": { - "value": "sean.miller@gmail.com", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Sean Miller", - "sean.miller@gmail.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Sean Miller, sean.miller@gmail.com" - }, "intent_template_id": 364, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Find the customer name and email with phone number 8015551212", + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": {"PhoneNum": "8015551212"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, + "required": ["email", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"name": "Sean Miller", "email": "sean.miller@gmail.com"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 212, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": { - "PhoneNum": "555-229-3326" - }, - "intent": "Find the customer name and email with phone number 555-229-3326", - 
"require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "name": { - "value": "Veronica Costello", - "type": "text" - }, - "email": { - "value": "roni_cost@example.com", - "type": "text" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Veronica Costello", - "roni_cost@example.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Veronica Costello, roni_cost@example.com" - }, "intent_template_id": 364, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Find the customer name and email with phone number 555-229-3326", + "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "instantiation_dict": {"PhoneNum": "555-229-3326"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "original.format_specification": "Return customer name in field \"name\" and email in field \"email\"", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, + "required": ["email", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"name": "Veronica Costello", "email": "roni_cost@example.com"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 213, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": 
"__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", - "instantiation_dict": { - "product": "Antonia Racer Tank" - }, - "intent": "List all reviews with 3 stars or below for Antonia Racer Tank.", - "original.intent": "What are the key aspects that the customers don't like about Antonia Racer Tank", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Zero support/modesty", - "type": "text" - }, - "rating": { - "value": "2", - "type": "numeric" - } - }, - { - "title": { - "value": "Not for high impact", - "type": "text" - }, - "rating": { - "value": "3", - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Not suitable for high-impact workouts" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Not suitable for high-impact workouts" - }, "intent_template_id": 249, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 3 stars or below for Antonia Racer Tank.", + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Antonia Racer Tank"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + 
"properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "title": "Zero support/modesty", "rating": "2" }, + { "title": "Not for high impact" , "rating": "3" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 214, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", - "instantiation_dict": { - "product": "Zing Jump Rope" - }, - "intent": "List all reviews with 3 stars or below for Zing Jump Rope.", - "original.intent": "What are the key aspects that the customers don't like about Zing Jump Rope", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "It is hard to find the right size. Won't last long" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "It is hard to find the right size. 
Won't last long" - }, "intent_template_id": 249, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 3 stars or below for Zing Jump Rope.", + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Zing Jump Rope"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, + "required": ["rating", "title"] + } + }, + "expected": {"performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": []} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 215, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", - "instantiation_dict": { - "product": "Circe ice fleece" - }, - "intent": "List all reviews with 3 stars or below for Circe ice fleece.", - "original.intent": "What are the key aspects that the customers don't like about Circe ice fleece", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Bad!", - "type": "text" - }, - "rating": { - "value": "1", - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": 
{ - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Material quality, fit, insufficient warmth, color" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Material quality, fit, insufficient warmth, color" - }, "intent_template_id": 249, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 3 stars or below for Circe ice fleece.", + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Circe ice fleece"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "Bad!", "rating": "1"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 216, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", - "instantiation_dict": { - "product": "Electra Bra Top" - }, - "intent": "List all reviews with 3 stars or below for Electra Bra Top.", - "original.intent": "What are the key aspects that the customers don't like about Electra Bra Top", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { 
- "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Not exactly true to size", - "type": "text" - }, - "rating": { - "value": "3", - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Not true to size" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Not true to size" - }, "intent_template_id": 249, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 3 stars or below for Electra Bra Top.", + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Electra Bra Top"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"title": "Not exactly true to size", "rating": "3"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 217, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "original.intent_template": "What are the key aspects that the customers don't like about {{product}}", - 
"instantiation_dict": { - "product": "Pursuit Tone Band" - }, - "intent": "List all reviews with 3 stars or below for Pursuit Tone Band.", - "original.intent": "What are the key aspects that the customers don't like about Pursuit Tone Band", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "title": { - "value": "Agreed. More resistance", - "type": "text" - }, - "rating": { - "value": "3", - "type": "numeric" - } - }, - { - "title": { - "value": "Want more resistance", - "type": "text" - }, - "rating": { - "value": "3", - "type": "numeric" - } - } - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Insufficient resistance for their workouts." - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Insufficient resistance for their workouts." 
- }, "intent_template_id": 249, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "List all reviews with 3 stars or below for Pursuit Tone Band.", + "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Pursuit Tone Band"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, + "required": ["rating", "title"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "title": "Agreed. More resistance", "rating": "3" }, + { "title": "Want more resistance" , "rating": "3" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 218, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", - "instantiation_dict": { - "location": "CMU, Pittsburgh", - "n": "5" - }, - "intent": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no hotel near CMU that is within 5 minutes walking distance", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 41, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Get the name of the hotel and 
walking distance of nearby hotels to CMU, Pittsburgh that take at most 5 minutes", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", + "instantiation_dict": {"location": "CMU, Pittsburgh", "n": "5"}, + "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 219, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", - "instantiation_dict": { - "location": "Pittsburgh airport", - "n": "3" - }, - "intent": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no hotel near CMU that is within 5 minutes walking distance", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 41, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Get the name of the hotel and walking distance of nearby hotels to Pittsburgh airport that take at most 3 minutes", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", + "instantiation_dict": {"location": "Pittsburgh airport", "n":
"3"}, + "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 220, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?", - "instantiation_dict": { - "location": "Gardner Steel Conference Center,", - "n": 5 - }, - "intent": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Wyndham Pittsburgh University Cente: 375m", - "The Oaklander Hotel: 338m" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Wyndham Pittsburgh University Cente: 375 m\nThe Oaklander Hotel: 338 m" - }, "intent_template_id": 41, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Get the name of the hotel and walking distance of nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", + "instantiation_dict": {"location": "Gardner Steel Conference Center,", "n": 5}, + "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "start_url_context": null, + "eval": [ { -
"key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name" : { "type": "string" }, + "distance": { "type": "string", "format": "distance" } + }, + "required": ["distance", "name"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "name": "Wyndham Pittsburgh University Cente", "distance": "375m" }, + { "name": "The Oaklander Hotel" , "distance": "338m" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 221, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I am at CMU Pittsburgh, how long it takes to the nearest {{location}} with different transportation methods?", - "instantiation_dict": { - "location": "USPS postal office" - }, - "intent": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Walk: 1 minute", - "Drive: less than 1 minute", - "Bike: less than 1 minute" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Walk: 1 minute to walk and\nDrive: less than 1 minute\nBike: less than 1 minute" - }, "intent_template_id": 35, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "I am at CMU Pittsburgh, how long does it take to the nearest USPS postal office with different transportation methods?", + "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", + "instantiation_dict": {"location": "USPS postal office"}, + 
"format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "transportation_method": { "type": "string" }, + "duration" : { "type": "string", "format": "duration" } + }, + "required": ["duration", "transportation_method"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "transportation_method": "Walking", "duration": "19min" }, + { "transportation_method": "Driving", "duration": "2min" }, + { "transportation_method": "Biking" , "duration": "7min" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 222, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", - "instantiation_dict": { - "location": "cold stone ice cream" - }, - "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "3min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3min" - }, "intent_template_id": 35, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "I am at CMU Pittsburgh, how long does it take to the nearest cold stone ice cream with different transportation methods?", + "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", + 
"instantiation_dict": {"location": "cold stone ice cream"}, + "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["3min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 223, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", - "instantiation_dict": { - "location": "Mcdonald's" - }, - "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "4min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "4min" - }, "intent_template_id": 35, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "I am at CMU Pittsburgh, how long does it take to the nearest Mcdonald's with different transportation methods?", + "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", + "instantiation_dict": {"location": "Mcdonald's"}, + "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + 
"ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["4min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 224, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}", - "instantiation_dict": { - "location": "wendys" - }, - "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "3min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3min" - }, "intent_template_id": 35, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "I am at CMU Pittsburgh, how long does it take to the nearest wendys with different transportation methods?", + "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", + "instantiation_dict": {"location": "wendys"}, + "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["3min"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 225, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - 
"start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", - "original.intent_template": "What do customers say about {{product_type}} from {{manufature}}", - "instantiation_dict": { - "product_type": "brush", - "manufature": "sephora" - }, - "intent": "Return the titles for reviews with 3 stars or below for brush from sephora", - "original.intent": "What do customers say about brush from sephora", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The sephora brushes don't have reviews", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 135, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Return the titles for reviews with 3 stars or below for brush from sephora", + "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", + "instantiation_dict": {"product_type": "brush", "manufature": "sephora"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 226, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - 
"intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": { - "brand": "Amazon basic" - }, - "intent": "What is the price range for products from Amazon basic?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 5.49, - "type": "currency" - }, - "max": { - "value": 375.19, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "5.49", - "375.19" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$5.49 - $375.19" - }, "intent_template_id": 370, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range for products from Amazon basic?", + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": {"brand": "Amazon basic"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 5.49, "max": 375.19} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 227, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the 
price range for products from {{brand}}?", - "instantiation_dict": { - "brand": "EYZUTAK" - }, - "intent": "What is the price range for products from EYZUTAK?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 9.99, - "type": "currency" - }, - "max": { - "value": 9.99, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "9.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$9.99" - }, "intent_template_id": 370, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range for products from EYZUTAK?", + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": {"brand": "EYZUTAK"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 9.99, "max": 9.99} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 228, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": { - "brand": 
"sephora" - }, - "intent": "What is the price range for products from sephora?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 18.18, - "type": "currency" - }, - "max": { - "value": 94.99, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "18.18", - "94.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$18.18 - $94.99" - }, "intent_template_id": 370, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range for products from sephora?", + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": {"brand": "sephora"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 18.18, "max": 94.99} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 229, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": { - "brand": "ugreen" - }, - "intent": "What is the price range for 
products from ugreen?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 6.99, - "type": "currency" - }, - "max": { - "value": 38.99, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "6.99", - "38.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$6.99 - $38.99" - }, "intent_template_id": 370, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range for products from ugreen?", + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": {"brand": "ugreen"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 6.99, "max": 38.99} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 230, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": { - "brand": "Perricone MD" - }, - "intent": "What is the price range for products from Perricone MD?", - "require_reset": false, - 
"eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "min": { - "value": 35.0, - "type": "currency" - }, - "max": { - "value": 149.0, - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "35", - "149" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$35 - $149" - }, "intent_template_id": 370, + "start_urls": ["__SHOPPING__"], + "intent": "What is the price range for products from Perricone MD?", + "intent_template": "What is the price range for products from {{brand}}?", + "instantiation_dict": {"brand": "Perricone MD"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "changelogs": [ + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "required": ["max", "min"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"min": 35.0, "max": 149.0} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 231, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the order number of my most recent {{status}} order ", - "instantiation_dict": { - "status": "cancelled" - }, - "intent": "Get the order number of my most recent cancelled order ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - 
"eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 170 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "170" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "000000170" - }, "intent_template_id": 213, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Get the order number of my most recent cancelled order ", + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": {"status": "cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [170] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 232, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the order number of my most recent {{status}} order ", - "instantiation_dict": { - "status": "pending" - }, - "intent": "Get the order number of my most recent pending order ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 189 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "189" - ] - }, - "reference_url": "", - "program_html": [], 
- "string_note": "", - "reference_answer_raw_annotation": "000000189" - }, "intent_template_id": 213, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Get the order number of my most recent pending order ", + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": {"status": "pending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [189] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 233, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the order number of my most recent {{status}} order ", - "instantiation_dict": { - "status": "complete" - }, - "intent": "Get the order number of my most recent complete order ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 180 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "180" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "000000180" - }, "intent_template_id": 213, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Get the order number of my most recent complete order ", + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": {"status": "complete"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [180] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 234, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the order number of my most recent {{status}} order ", - "instantiation_dict": { - "status": "on hold" - }, - "intent": "Get the order number of my most recent on hold order ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "there is no on hold order", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 213, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Get the order number of my most recent on hold order ", + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": {"status": "on hold"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + 
"revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 235, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the order number of my most recent {{status}} order ", - "instantiation_dict": { - "status": "under delivery" - }, - "intent": "Get the order number of my most recent under delivery order ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no under delivery order", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 213, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Get the order number of my most recent under delivery order ", + "intent_template": "Get the order number of my most recent {{status}} order ", + "instantiation_dict": {"status": "under delivery"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 236, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", + "intent_template_id": 39, + "start_urls": ["__MAP__"], + "intent": "Get the name and address of the nearest 
pharmacy from Carnegie Mellon I can walk within 20mins", + "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}", "instantiation_dict": { "location": "pharmacy", "location2": "Carnegie Mellon", "condition": "I can walk within 20mins" }, - "intent": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Schiller's Pharmacy", - "811", - "South Aiken Avenue", - "Shadyside", - "Pittsburgh" - ] - } + "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" } + }, + "required": ["city", "name", "state", "house_number", "street", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Schiller's Pharmacy", + "house_number": "811", + "street": "South Aiken Avenue", + "city": "Pittsburgh", + "state": "Pennsylvania", + "postcode": "15232" + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Schiller's Pharmacy", - "811", - "South Aiken Avenue", - "Shadyside", - "Pittsburgh" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Schiller's Pharmacy, 811, South 
Aiken Avenue, Shadyside, Pittsburgh, Allegheny County, 15232, United States" - }, - "intent_template_id": 39, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 237, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}", - "instantiation_dict": { - "location": "gas station", - "location2": "CMU", - "condition": "" - }, - "intent": "Where is the nearest gas station from CMU ", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Sunoco", - "North Craig Street", - "North Oakland", - "Pittsburgh" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Sunoco", - "North Craig Street", - "North Oakland", - "Pittsburgh" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Sunoco, North Craig Street, North Oakland, Pittsburgh, Allegheny County, 15213, United States" - }, "intent_template_id": 39, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "start_urls": ["__MAP__"], + "intent": "Get the name and address of the nearest gas station from CMU", + "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}", + "instantiation_dict": {"location": "gas station", "location2": "CMU", "condition": ""}, + "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", + "start_url_context": null, + 
"eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" } + }, + "required": ["city", "name", "state", "house_number", "street", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Sunoco", + "house_number": "", + "street": "North Craig Street", + "city": "Pittsburgh", + "state": "Pennsylvania", + "postcode": "15213" + } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 238, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": { - "product_category": "PS4 accessories" - }, + "intent_template_id": 138, + "start_urls": ["__SHOPPING__"], "intent": "I am doing a market survey for one stop market, show me the most expensive product from PS4 accessories category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 138 + "intent_template": "I am doing a market survey for one stop market, 
show me the most expensive product from {{product_category}} category", + "instantiation_dict": {"product_category": "PS4 accessories"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 239, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": { - "product_category": "nutrition bars and drinks" - }, + "intent_template_id": 138, + "start_urls": ["__SHOPPING__"], "intent": "I am doing a market survey for one stop market, show me the most expensive product from nutrition bars and drinks category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 138 + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + 
"instantiation_dict": {"product_category": "nutrition bars and drinks"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 240, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": { - "product_category": "competitive swimwear" - }, + "intent_template_id": 138, + "start_urls": ["__SHOPPING__"], "intent": "I am doing a market survey for one stop market, show me the most expensive product from competitive swimwear category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 138 + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + 
"instantiation_dict": {"product_category": "competitive swimwear"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 241, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": { - "product_category": "skin care tool" - }, + "intent_template_id": 138, + "start_urls": ["__SHOPPING__"], "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care tool category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 138 - }, - { - "sites": [ - "shopping" - ], - "task_id": 242, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, "intent_template": "I am doing a market survey for one stop 
market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": { - "product_category": "Household Supplies" - }, - "intent": "I am doing a market survey for one stop market, show me the most expensive product from Household Supplies category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 138 + "instantiation_dict": {"product_category": "skin care tool"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" + "sites": ["shopping"], + "task_id": 242, + "intent_template_id": 138, + "start_urls": ["__SHOPPING__"], + "intent": "I am doing a market survey for one stop market, show me the most expensive product from Household Supplies category", + "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from 
{{product_category}} category", + "instantiation_dict": {"product_category": "Household Supplies"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", + "response_status": 200, + "event_type": "navigation" + } + } ], + "revision": 2 + }, + { + "sites": ["shopping_admin"], "task_id": 243, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": { - "information": "email address", - "product": "Circe fleece" - }, + "intent_template_id": 244, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Show me the email address of the customer who is the most unhappy with Circe fleece", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "hannah.lim@gmail.com" - ] - } + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": {"information": "email address", "product": "Circe fleece"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, 
+ "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["hannah.lim@gmail.com"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "hannah.lim@gmail.com" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "hannah.lim@gmail.com" - }, - "intent_template_id": 244 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 244, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": { - "information": "email address", - "product": "Olivia zip jacket" - }, + "intent_template_id": 244, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "emma.lopez@gmail.com" - ] - } + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": {"information": "email address", "product": "Olivia zip jacket"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["emma.lopez@gmail.com"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - 
"exact_match": "emma.lopez@gmail.com" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "emma.lopez@gmail.com" - }, - "intent_template_id": 244 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 245, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": { - "information": "name", - "product": "Antonia racer tank" - }, + "intent_template_id": 244, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Show me the name of the customer who is the most unhappy with Antonia racer tank", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Shaunte" - ] - } + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": {"information": "name", "product": "Antonia racer tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Shaunte"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Shaunte" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Shaunte" - }, - "intent_template_id": 244 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 246, - 
"require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": { - "information": "name", - "product": "Chloe tank" - }, + "intent_template_id": 244, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Show me the name of the customer who is the most unhappy with Chloe tank", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Teofila" - ] - } + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": {"information": "name", "product": "Chloe tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Teofila"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Teofila" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Teofila" - }, - "intent_template_id": 244 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 247, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": { - "information": "email address", - "product": "the style of Zoe products" - }, - 
"intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "Valorie doesn't have a email in the system", - "program_html": [], - "string_note": "There is no negative review for Zoe products", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 244, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products", + "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "instantiation_dict": {"information": "email address", "product": "the style of Zoe products"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 248, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": { - "location": "Carnegie Mellon Caf\u00e9" - }, - "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": 
"verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "40.442", - "-79.939" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.442", - "-79.939" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "40.4424191, -79.9397388" - }, "intent_template_id": 46, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format", + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9"}, + "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"format": "coordinates"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"latitude": "40.4424191", "longitude": "-79.9397388"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 249, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": { - "location": "Western Pennsylvania Hospital Heliport" - }, - "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "40.460", - "-79.946" 
- ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.460", - "-79.946" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "40.46076, -79.94666" - }, "intent_template_id": 46, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format", + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": {"location": "Western Pennsylvania Hospital Heliport"}, + "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"format": "coordinates"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"latitude": "40.46076", "longitude": "-79.94666"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 250, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": { - "location": "Apple Store near Pitt" - }, - "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "40.451", - "-79.933" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.451", - 
"-79.933" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "40.4511693, -79.9334241" - }, "intent_template_id": 46, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": {"location": "Apple Store near Pitt"}, + "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"format": "coordinates"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"latitude": "40.4511693", "longitude": "-79.9334241"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 251, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": { - "location": "bus stop on the Carnegie art museum side of the street near CMU" - }, - "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "40.444", - "-79.948" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.444", - "-79.948" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - 
"reference_answer_raw_annotation": "40.4443, -79.94889" - }, "intent_template_id": 46, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format", + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": {"location": "bus stop on the Carnegie art museum side of the street near CMU"}, + "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"format": "coordinates"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"latitude": "40.4443", "longitude": "-79.94889"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 252, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": { - "location": "Tokyo Japanese Food Store in Pittsburgh" - }, - "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "40.457", - "-79.929" - ] - } - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.457", - "-79.929" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "40.45761, -79.92934" - }, 
"intent_template_id": 46, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format", + "intent_template": "Tell me the coordinates of {{location}} in DD format", + "instantiation_dict": {"location": "Tokyo Japanese Food Store in Pittsburgh"}, + "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"format": "coordinates"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"latitude": "40.45761", "longitude": "-79.92934"} ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 253, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": { - "location": "Carnegie Mellon Caf\u00e9", - "information": "phone number" - }, - "intent": "What is the phone number of Carnegie Mellon Caf\u00e9", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no such information in the map", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 501, - "changelogs": [ + "start_urls": ["__MAP__"], + "intent": "What is the phone number of Carnegie Mellon Caf\u00e9", + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9", "information": "phone number"}, + "format_specification": null, + 
"start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 254, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": { - "location": "Western Pennsylvania Hospital", - "information": "phone number" - }, + "intent_template_id": 501, + "start_urls": ["__MAP__"], "intent": "What is the phone number of Western Pennsylvania Hospital", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "4125785000" - ] - } + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": {"location": "Western Pennsylvania Hospital", "information": "phone number"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["4125785000"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "4125785000" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "4125785000" - }, - "intent_template_id": 501 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 255, - "require_login": 
true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Who is the {{information}} of {{location}}", - "instantiation_dict": { - "location": "PIT airport", - "information": "operator" - }, + "intent_template_id": 501, + "start_urls": ["__MAP__"], "intent": "Who is the operator of PIT airport", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Allegheny County Airport Authority" - ] - } + "intent_template": "Who is the {{information}} of {{location}}", + "instantiation_dict": {"location": "PIT airport", "information": "operator"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Allegheny County Airport Authority"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Allegheny County Airport Authority" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Allegheny County Airport Authority" - }, - "intent_template_id": 501 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 256, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": { - "location": "Carnegie art museum in pittsburgh", - "information": "website" - }, + "intent_template_id": 501, + "start_urls": ["__MAP__"], "intent": "What is the website of Carnegie art museum in pittsburgh", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "http://web.cmoa.org/" - ] - } + "intent_template": "What is the {{information}} of {{location}}", + "instantiation_dict": {"location": "Carnegie art museum in pittsburgh", "information": "website"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["http://web.cmoa.org/"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "http://web.cmoa.org/" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "http://web.cmoa.org/" - }, - "intent_template_id": 501 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 257, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 501, + "start_urls": ["__MAP__"], + "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh", "intent_template": "What is the {{information}} of {{location}}", "instantiation_dict": { "location": "Tokyo Japanese Food Store in Pittsburgh", "information": "hours of operation" }, - "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "We-Su 10:00-17:00" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "We-Su 10:00-17:00" - }, - "intent_template_id": 501, - 
"changelogs": [ + "format_specification": "Provide the information for each day of the week when the location is open. Use \"day\" for the day of the week, \"open_time\" for the opening time, and \"close_time\" for the closing time.", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "day" : { "type": "string" }, + "open_time" : { "type": "string" }, + "close_time": { "type": "string" } + }, + "required": ["close_time", "day", "open_time"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "day": "Wednesday", "open_time": "10:00", "close_time": "17:00" }, + { "day": "Thursday" , "open_time": "10:00", "close_time": "17:00" }, + { "day": "Friday" , "open_time": "10:00", "close_time": "17:00" }, + { "day": "Saturday" , "open_time": "10:00", "close_time": "17:00" }, + { "day": "Sunday" , "open_time": "10:00", "close_time": "17:00" } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 258, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 325, + "start_urls": ["__GITLAB__"], + "intent": "Navigate to all public projects", "intent_template": "Navigate to all public projects", - "original.intent_template": "See all public projects", "instantiation_dict": {}, - "intent": "Navigate to all public projects", - "original.intent": "See all public projects", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/explore" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/explore", - "program_html": [], - 
"url_note": "GOLD in PRED" - }, - "intent_template_id": 325, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/explore?visibility_level=20", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 259, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 312, + "start_urls": ["__GITLAB__"], + "intent": "Get me my RSS feed token", "intent_template": "Get me my RSS feed token", "instantiation_dict": {}, - "intent": "Get me my RSS feed token", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "TMN_bBn9Z48qVbUFZV45" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "TMN_bBn9Z48qVbUFZV45" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "TMN_bBn9Z48qVbUFZV45" - }, - "intent_template_id": 312, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "ordered": 
"false", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["TMN_bBn9Z48qVbUFZV45"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 260, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I want to browse the products in the {{category}} category", - "instantiation_dict": { - "category": "Video Game" - }, + "intent_template_id": 211, + "start_urls": ["__SHOPPING__"], "intent": "I want to browse the products in the Video Game category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/video-games.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/video-games.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 211 + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": {"category": "Video Game"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/video-games.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 261, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I want to browse the products in the 
{{category}} category", - "instantiation_dict": { - "category": "Headphones" - }, + "intent_template_id": 211, + "start_urls": ["__SHOPPING__"], "intent": "I want to browse the products in the Headphones category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/electronics/headphones.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/electronics/headphones.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 211 + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": {"category": "Headphones"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/electronics/headphones.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 262, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I want to browse the products in the {{category}} category", - "instantiation_dict": { - "category": "Men shoes" - }, + "intent_template_id": 211, + "start_urls": ["__SHOPPING__"], "intent": "I want to browse the products in the Men shoes category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - 
"reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 211 + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": {"category": "Men shoes"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 263, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I want to browse the products in the {{category}} category", - "instantiation_dict": { - "category": "Woman clothing" - }, + "intent_template_id": 211, + "start_urls": ["__SHOPPING__"], "intent": "I want to browse the products in the Woman clothing category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 211 + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": {"category": "Woman clothing"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + 
"results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 264, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I want to browse the products in the {{category}} category", - "instantiation_dict": { - "category": "Cabinets, Racks & Shelves" - }, + "intent_template_id": 211, + "start_urls": ["__SHOPPING__"], "intent": "I want to browse the products in the Cabinets, Racks & Shelves category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 211 + "intent_template": "I want to browse the products in the {{category}} category", + "instantiation_dict": {"category": "Cabinets, Racks & Shelves"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": 
"__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 265, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What's the closest national park to {{city}}? How far is it to drive there?", - "instantiation_dict": { - "city": "Boston" - }, + "intent_template_id": 85, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "What's the closest national park to Boston? How far is it to drive there?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "What's the closest national park to {{city}}? How far is it to drive there?", + "instantiation_dict": {"city": "Boston"}, + "format_specification": "Provide the name of the park using \"park_name\" and its distance using \"distance\".", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "park_name": { "type": "string" }, + "distance" : { "type": "string", "format": "distance" } }, - "expected_data": [ - "Acadia National Park", - "457km" - ] + "required": ["distance", "park_name"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"park_name": "Acadia National Park", "distance": "457km"} ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Acadia National Park", - "457km" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Acadia National 
Park\n457km" - }, - "intent_template_id": 85, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 266, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What's the closest national park to {{city}}?", - "instantiation_dict": { - "city": "the largest city in Maine" - }, + "intent_template_id": 85, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "What's the closest national park to the largest city in Maine?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Acadia National Park" - ] - } + "intent_template": "What's the closest national park to {{city}}?", + "instantiation_dict": {"city": "the largest city in Maine"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Acadia National Park"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Acadia National Park" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Acadia National Park" - }, - "intent_template_id": 85 + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 267, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "What's the closest national park to {{city}}? 
How long it takes to drive there?", - "instantiation_dict": { - "city": "the hometown of Stephen King" - }, + "intent_template_id": 85, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "What's the closest national park to {{city}}? How long it takes to drive there?", + "instantiation_dict": {"city": "the hometown of Stephen King"}, + "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "park_name": { "type": "string" }, + "time" : { "type": "string", "format": "duration" } }, - "expected_data": [ - "Acadia National Park" - ] + "required": ["park_name", "time"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"park_name": "Acadia National Park", "time": "1h 23min"} ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Acadia National Park" - ], - "fuzzy_match": [ - "1h 23min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Acadia National Park\n1h 23min" - }, - "intent_template_id": 85, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 268, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - 
"geolocation": null, - "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", - "instantiation_dict": { - "city": "Vinalhaven, ME" - }, + "intent_template_id": 85, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", + "instantiation_dict": {"city": "Vinalhaven, ME"}, + "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "park_name": { "type": "string" }, + "time" : { "type": "string", "format": "duration" } }, - "expected_data": [ - "Acadia National Park" - ] + "required": ["park_name", "time"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ {"park_name": "Acadia National Park", "time": "10h 33min"} ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Acadia National Park" - ], - "fuzzy_match": [ - "10h 33min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Acadia National Park\n10h 33min" - }, - "intent_template_id": 85, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 269, - "require_login": true, - 
"storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": { - "price": "25", - "product_category": "women shoes" - }, + "intent_template_id": 139, + "start_urls": ["__SHOPPING__"], "intent": "Show me products under $25 in \"women shoes\" category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 139 + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": {"price": "25", "product_category": "women shoes"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 270, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": { - "price": "30", - "product_category": "men shoes" - }, + "intent_template_id": 139, + "start_urls": 
["__SHOPPING__"], "intent": "Show me products under $30 in \"men shoes\" category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 139 + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": {"price": "30", "product_category": "men shoes"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 271, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": { - "price": "46.99", - "product_category": "makeup remover" - }, + "intent_template_id": 139, + "start_urls": ["__SHOPPING__"], "intent": "Show me products under $46.99 in \"makeup remover\" category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - 
"url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 139 + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": {"price": "46.99", "product_category": "makeup remover"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 272, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": { - "price": "78", - "product_category": "children dental care" - }, + "intent_template_id": 139, + "start_urls": ["__SHOPPING__"], "intent": "Show me products under $78 in \"children dental care\" category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 139 + "intent_template": 
"Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": {"price": "78", "product_category": "children dental care"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 273, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": { - "price": "199", - "product_category": "furtiture with accent" - }, + "intent_template_id": 139, + "start_urls": ["__SHOPPING__"], "intent": "Show me products under $199 in \"furtiture with accent\" category", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 139 + "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", + "instantiation_dict": {"price": "199", "product_category": "furtiture with accent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 274, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Search for \"{{keyword}}\"", - "instantiation_dict": { - "keyword": "usb wifi" - }, + "intent_template_id": 212, + "start_urls": ["__SHOPPING__"], "intent": "Search for \"usb wifi\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 212 + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": {"keyword": "usb wifi"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 275, - 
"require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Search for \"{{keyword}}\"", - "instantiation_dict": { - "keyword": "xbox" - }, + "intent_template_id": 212, + "start_urls": ["__SHOPPING__"], "intent": "Search for \"xbox\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=xbox" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=xbox", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 212 + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": {"keyword": "xbox"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=xbox", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 276, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Search for \"{{keyword}}\"", - "instantiation_dict": { - "keyword": "switch accessories" - }, + "intent_template_id": 212, + "start_urls": ["__SHOPPING__"], "intent": "Search for \"switch accessories\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - 
], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 212 + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": {"keyword": "switch accessories"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 277, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Search for \"{{keyword}}\"", - "instantiation_dict": { - "keyword": "batteries for iphone 13" - }, + "intent_template_id": 212, + "start_urls": ["__SHOPPING__"], "intent": "Search for \"batteries for iphone 13\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=iphone+13" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=iphone+13", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 212 + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": {"keyword": "batteries for iphone 13"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=batteries+for+iphone+13", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 278, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Search for \"{{keyword}}\"", - "instantiation_dict": { - "keyword": "green tea bag for weight loss" - }, + "intent_template_id": 212, + "start_urls": ["__SHOPPING__"], "intent": "Search for \"green tea bag for weight loss\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 212 + "intent_template": "Search for \"{{keyword}}\"", + "instantiation_dict": {"keyword": "green tea bag for weight loss"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": 
["shopping"], "task_id": 279, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "original.intent_template": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models", - "instantiation_dict": { - "product": "Bluetooth headphones from Sony" - }, - "original.instantiation_dict": {}, + "intent_template_id": 204, + "start_urls": ["__SHOPPING__"], "intent": "Provide me with the full names of Bluetooth headphones from Sony, and also share the price range for the available models", - "original.intent": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "names": { - "value": [ - "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", - "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", - "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", - "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", - "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", - "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", - "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", - "Sony MDRAS600BT Active Sports Bluetooth Headset 
(Black)", - "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", - "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", - "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", - "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed" - ], - "type": "text" - }, - "min": { - "value": "18.99", - "type": "currency" - }, - "max": { - "value": "406", - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", - "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", - "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", - "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", - "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", - "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", - "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", - "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", - "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", - "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", - "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", - "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed", - "18.99", - "406" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "These models are avaiable: SONY 
WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed) Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items) Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W) Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B) Sony MDRAS600BT Active Sports Bluetooth Headset (Black) Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items) Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R) Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed The price ranges from $18.99 to $406 " - }, - "intent_template_id": 204, + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "instantiation_dict": {"product": "Bluetooth headphones from Sony"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "names": { "type": "array", "items": {"type": "string"} }, + "min": {"type": "string"}, + "max": {"type": "string"} + }, + "required": ["max", 
"min", "names"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "names": [ + "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", + "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", + "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", + "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", + "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", + "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", + "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", + "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", + "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", + "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", + "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", + "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed" + ], + "min": "18.99", + "max": "406" + } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 280, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "original.intent_template": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", - "instantiation_dict": { - "product": "chargers from Anker" - }, - "original.instantiation_dict": {}, - "intent": 
"Provide me with the full names of chargers from Anker, and also share the price range for the available models", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "names": { - "value": [ - "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", - "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", - "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", - "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", - "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", - "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", - "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", - "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", - "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable 
PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", - "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", - "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", - "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)" - ], - "type": "text" - }, - "min": { - "value": "8.99", - "type": "currency" - }, - "max": { - "value": "59.99", - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", - "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", - "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", - "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", - "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", - "Anker Wireless Charger, 
313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", - "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", - "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", - "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", - "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", - "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", - "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)", - "8.99", - "59.99" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "These models are availiable: Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included) Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with 
Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278) 5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter) Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter) USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111) Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included) Magnetic Wireless Charger, Anker Wireless Charger with 5ft Built-in USB-C Cable, PowerWave Magnetic Pad, 7.5W Charging for iPhone 13 / 13 Pro / 13 Pro Max / 13 mini / 12 / 12 Pro (No 
AC Adapter) USB C Super Fast Charger, Anker 25W PD Wall Charger Fast Charging for Samsung Galaxy S21/S21+/S21 Ultra/S20/Z Flip/Note20/20 Ultra/Note10/10+/S9/S8/S10e, iPad Pro 12.9, and More (Cable not Included) The price ranges from $8.99 to $59.99" - }, "intent_template_id": 204, + "start_urls": ["__SHOPPING__"], + "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "instantiation_dict": {"product": "chargers from Anker"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "names": { "type": "array", "items": {"type": "string"} }, + "min": {"type": "number", "format": "currency"}, + "max": {"type": "number", "format": "currency"} + }, + "required": ["max", "min", "names"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "names": [ + "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", + "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", + "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", + "5Ft 
Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", + "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", + "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", + "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", + "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", + "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", + "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", + "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", + "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)" + ], + "min": "8.99", + "max": "59.99" + } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 281, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - 
"start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "original.intent_template": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", - "instantiation_dict": { - "product": "Oral B brush heads designed for children" - }, - "original.instantiation_dict": {}, - "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models", - "original.intent": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "names": { - "value": [ - "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", - "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack" - ], - "type": "text" - }, - "min": { - "value": "12.99", - "type": "currency" - }, - "max": { - "value": "14.98", - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", - "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack", - "3.745", - "6.495" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "These models are availiable: Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack The price ranges from $3.745 to $6.495 " - }, 
"intent_template_id": 204, + "start_urls": ["__SHOPPING__"], + "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "instantiation_dict": {"product": "Oral B brush heads designed for children"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "names": { "type": "array", "items": {"type": "string"} }, + "min": {"type": "string"}, + "max": {"type": "string"} + }, + "required": ["max", "min", "names"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "names": [ + "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", + "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack" + ], + "min": "12.99", + "max": "14.98" + } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 282, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "original.intent_template": "List the full product names of slide slippers from Nike and tell me the price range of the available products", - "instantiation_dict": { - "product": "slide slippers from Nike" - }, - "original.instantiation_dict": {}, - "intent": "Provide me with the 
full names of slide slippers from Nike, and also share the price range for the available models", - "original.intent": "List the full product names of slide slippers from Nike and tell me the price range of the available products", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "names": { - "value": [ - "Nike Men's Air Max Camden Slide Sandal", - "Nike Men's Benassi JDI Fanny Pack Slides", - "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", - "Nike Offcourt Slide Mens Bq4639-002 Size 12", - "Nike Jordan Men's Break Slide Red AR6374-602", - "Nike Victori One Slide Mens Style : Dd9559-300", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", - "Nike womens Benassi Just Do It" - ], - "type": "text" - }, - "min": { - "value": "27.6", - "type": "currency" - }, - "max": { - "value": "90.65", - "type": "currency" - } - } - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Nike Men's Air Max Camden Slide Sandal", - "Nike Men's Benassi JDI Fanny Pack Slides", - "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", - "Nike Offcourt Slide Mens Bq4639-002 Size 12", - "Nike Jordan Men's Break Slide Red AR6374-602", - "Nike Victori One Slide Mens Style : Dd9559-300", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", - "Nike womens Benassi Just Do It", - "27.6", - "90.65" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": 
"These models are availiable: Nike Men's Air Max Camden Slide Sandal Nike Men's Benassi JDI Fanny Pack Slides Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10) Nike Offcourt Slide Mens Bq4639-002 Size 12 Nike Jordan Men's Break Slide Red AR6374-602 Nike Victori One Slide Mens Style : Dd9559-300 Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14) Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8) Nike womens Benassi Just Do It The price ranges from $27.6 to $90.65" - }, "intent_template_id": 204, + "start_urls": ["__SHOPPING__"], + "intent": "Provide me with the full names of slide slippers from Nike, and also share the price range for the available models", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", + "instantiation_dict": {"product": "slide slippers from Nike"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "names": { "type": "array", "items": {"type": "string"} }, + "min": {"type": "string"}, + "max": {"type": "string"} + }, + "required": ["max", "min", "names"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "names": [ + "Nike Men's Air Max Camden Slide Sandal", + "Nike Men's Benassi JDI Fanny Pack Slides", + "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", + "Nike Offcourt Slide Mens Bq4639-002 Size 12", + "Nike Jordan Men's Break Slide Red AR6374-602", + "Nike 
Victori One Slide Mens Style : Dd9559-300", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", + "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", + "Nike womens Benassi Just Do It" + ], + "min": "27.6", + "max": "90.65" + } + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 283, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 210, + "start_urls": ["__SHOPPING__"], + "intent": "Navigate to the most recent models of XBox controllers released between 2020-2021.", "intent_template": "Navigate to the most recent models of XBox controllers released between 2020-2021.", - "original.intent_template": "Look up the most recent models of XBox controllers released between 2020-2021?", "instantiation_dict": {}, - "intent": "Navigate to the most recent models of XBox controllers released between 2020-2021.", - "original.intent": "Look up the most recent models of XBox controllers released between 2020-2021?", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 210, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": 
"Clearly specify expected navigate vs return value. and remove ambiguous '?'." + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 284, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": { - "product": "shoe storage", - "min_storage": "12 pairs" - }, + "intent_template_id": 207, + "start_urls": ["__SHOPPING__"], "intent": "Show the least expensive shoe storage with a minimum storage capacity of 12 pairs.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 207 + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + 
"instantiation_dict": {"product": "shoe storage", "min_storage": "12 pairs"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 285, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": { - "product": "switch card holder", - "min_storage": "15 cards" - }, + "intent_template_id": 207, + "start_urls": ["__SHOPPING__"], "intent": "Show the least expensive switch card holder with a minimum storage capacity of 15 cards.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 207 + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "instantiation_dict": {"product": 
"switch card holder", "min_storage": "15 cards"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 286, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": { - "product": "ssd hard drive", - "min_storage": "1TB" - }, + "intent_template_id": 207, + "start_urls": ["__SHOPPING__"], "intent": "Show the least expensive ssd hard drive with a minimum storage capacity of 1TB.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 207 + "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + 
"instantiation_dict": {"product": "ssd hard drive", "min_storage": "1TB"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 287, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 47, + "start_urls": ["__MAP__"], + "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", "instantiation_dict": {}, - "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "5h 47min" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "5h 47min" - }, - "intent_template_id": 47, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["5h 47min"] 
+ } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 288, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 234, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Get the name of the customer who has the most cancellations in the history", "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": { - "attribute": "name" - }, - "intent": "Tell me the name of the customer who has the most cancellations in the history", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Samantha Jones" - ] - } + "instantiation_dict": {"attribute": "name"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Samantha Jones"] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Samantha Jones" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Samantha Jones" - }, - "intent_template_id": 234 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 289, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - 
"instantiation_dict": { - "attribute": "email address, name, phone number" - }, + "intent_template_id": 234, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": {"attribute": "email address, name, phone number"}, + "format_specification": "Use \"name\" for the customer name and \"email\" for the email and \"phone_number\" for the phone number.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name" : { "type": "string" }, + "email" : { "type": "string" }, + "phone_number": { "type": "string" } }, - "expected_data": [ - { - "name": { - "value": "Samantha Jones", - "type": "text" - }, - "email": { - "value": "coolcat321@hotmail.com", - "type": "text" - }, - "phone_number": { - "value": "3055551212", - "type": "text" - } - } - ] + "required": ["email", "name", "phone_number"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "name": "Samantha Jones", + "email": "coolcat321@hotmail.com", + "phone_number": "3055551212" + } + ] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "email: coolcat321@hotmail.com", - "name: Samantha Jones", - "phone number: 3055551212" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "email: coolcat321@hotmail.com name: Samantha Jones phone 
number: 3055551212" - }, - "intent_template_id": 234, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email and \"phone_number\" for the phone number.", - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 290, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": { - "attribute": "product SKUs in the most recent cancelled orders" - }, - "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "WSH09-29-White", - "WSH09-28-Green", - "MSH11-34-Blue", - "WP09-29-Purple" - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "WSH09-29-White", - "WSH09-28-Green", - "MSH11-34-Blue", - "WP09-29-Purple" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "WSH09-29-White,WSH09-28-Green,MSH11-34-Blue,WP09-29-Purple" - }, "intent_template_id": 234, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": {"attribute": "product 
SKUs in the most recent cancelled orders"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["WSH09-29-White", "WSH09-28-Green", "MSH11-34-Blue", "WP09-29-Purple"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 291, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": { - "attribute": "total spend on products in the most recent cancelled orders" - }, - "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 148 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "148" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "$148" - }, "intent_template_id": 234, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the 
history", + "instantiation_dict": {"attribute": "total spend on products in the most recent cancelled orders"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [148] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 292, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": { - "attribute": "total number of cancellations" - }, - "intent": "Tell me the total number of cancellations of the customer who has the most cancellations in the history", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 9 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "9" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "9" - }, "intent_template_id": 234, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Tell me the total number of cancellations of the customer who has the most cancellations in the history", + "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "instantiation_dict": {"attribute": "total number of cancellations"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [9] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 293, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", - "original.intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": { - "repo": "Super_Awesome_Robot" - }, - "intent": "Show me the command to clone Super_Awesome_Robot with SSH. Use the globally provided host and port", - "original.intent": "Show me the command to clone Super_Awesome_Robot with SSH.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/super_awesome_robot.git" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git" - }, "intent_template_id": 329, - "changelogs": [ - { - "key": "intent_template", - "category": "reference_alignment", - "note": "Specify which host and port to use when giving the final output" - }, + "start_urls": 
["__GITLAB__"], + "intent": "Show me the command to clone Super_Awesome_Robot with SSH.", + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": {"repo": "Super_Awesome_Robot"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/super_awesome_robot.git" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 294, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", - "original.intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": { - "repo": "ChatGPT" - }, - "intent": "Show me the command to clone ChatGPT with SSH. 
Use the globally provided host and port", - "original.intent": "Show me the command to clone ChatGPT with SSH.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/chatgpt.git" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git" - }, "intent_template_id": 329, - "changelogs": [ - { - "key": "intent_template", - "category": "reference_alignment", - "note": "Specify which host and port to use when giving the final output" - }, + "start_urls": ["__GITLAB__"], + "intent": "Show me the command to clone ChatGPT with SSH.", + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": {"repo": "ChatGPT"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/convexegg/chatgpt.git"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 295, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Show me the command to clone {{repo}} with SSH. 
Use the globally provided host and port", - "original.intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": { - "repo": "metaseq" - }, - "intent": "Show me the command to clone metaseq with SSH. Use the globally provided host and port", - "original.intent": "Show me the command to clone metaseq with SSH.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/root/metaseq.git" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git" - }, "intent_template_id": 329, - "changelogs": [ - { - "key": "intent_template", - "category": "reference_alignment", - "note": "Specify which host and port to use when giving the final output" - }, + "start_urls": ["__GITLAB__"], + "intent": "Show me the command to clone metaseq with SSH.", + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": {"repo": "metaseq"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/root/metaseq.git"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 296, 
- "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", - "original.intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": { - "repo": "the best GAN python implementation" - }, - "intent": "Show me the command to clone the best GAN python implementation with SSH. Use the globally provided host and port", - "original.intent": "Show me the command to clone the best GAN python implementation with SSH.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/eriklindernoren/PyTorch-GAN.git" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git" - }, "intent_template_id": 329, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Show me the command to clone the best GAN python implementation with SSH.", + "intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": {"repo": "the best GAN python implementation"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "reference_alignment", - "note": "Specify which host and port to use when giving the final output" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": 
"AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "git clone ssh://git@{{ resource.base_host }}:2222/eriklindernoren/PyTorch-GAN.git" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 297, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Show me the command to clone {{repo}} with SSH. Use the globally provided host and port", - "original.intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": { - "repo": "the most stared Covid location tracker" - }, - "intent": "Show me the command to clone the most stared Covid location tracker with SSH. Use the globally provided host and port", - "original.intent": "Show me the command to clone the most stared Covid location tracker with SSH.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/yjlou/2019-nCov.git" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git" - }, "intent_template_id": 329, - "changelogs": [ - { - "key": "intent_template", - "category": "reference_alignment", - "note": "Specify which host and port to use when giving the final output" - }, + "start_urls": ["__GITLAB__"], + "intent": "Show me the command to clone the most stared Covid location tracker with SSH.", + 
"intent_template": "Show me the command to clone {{repo}} with SSH.", + "instantiation_dict": {"repo": "the most stared Covid location tracker"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/yjlou/2019-nCov.git"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 298, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the most recent {{status}} order", - "instantiation_dict": { - "status": "completed" - }, + "intent_template_id": 180, + "start_urls": ["__SHOPPING__"], "intent": "Show the most recent completed order", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/sales/order/view/order_id/180/" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/sales/order/view/order_id/180/", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 180 + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": {"status": "completed"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { 
+ "url": "__SHOPPING__/sales/order/view/order_id/180/", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 299, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the most recent {{status}} order", - "instantiation_dict": { - "status": "cancelled" - }, + "intent_template_id": 180, + "start_urls": ["__SHOPPING__"], "intent": "Show the most recent cancelled order", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/sales/order/view/order_id/170/" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/sales/order/view/order_id/170/", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 180 + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": {"status": "cancelled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/sales/order/view/order_id/170/", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 300, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the most recent {{status}} order", - "instantiation_dict": { - "status": "pending" - }, + "intent_template_id": 180, + "start_urls": ["__SHOPPING__"], 
"intent": "Show the most recent pending order", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/sales/order/view/order_id/189/" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/sales/order/view/order_id/189/", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 180 + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": {"status": "pending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/sales/order/view/order_id/189/", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 301, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the most recent {{status}} order", - "instantiation_dict": { - "status": "processing" - }, - "intent": "Show the most recent processing order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "there is no order in processing" - }, "intent_template_id": 180, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Show the 
most recent processing order", + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": {"status": "processing"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 302, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show the most recent {{status}} order", - "instantiation_dict": { - "status": "out of delivery" - }, - "intent": "Show the most recent out of delivery order", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "there is no order in processing" - }, "intent_template_id": 180, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Show the most recent out of delivery order", + "intent_template": "Show the most recent {{status}} order", + "instantiation_dict": {"status": "out of delivery"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": 
"NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 303, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": { - "user": "Kilian", - "period": "during 2023" - }, - "original.instantiation_dict": { - "user": "Kilian", - "period": "durning 2023" - }, - "intent": "How many commits did Kilian make during 2023?", - "original.intent": "How many commits did Kilian make durning 2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 1 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "1" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1" - }, "intent_template_id": 321, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Specify response format and correct spelling" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Kilian make during 2023?", + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": {"user": "Kilian", "period": "during 2023"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": 
"AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [1] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 304, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": { - "user": "Eric", - "period": "between Feb 2023 and May 2023" - }, - "intent": "How many commits did Eric make between Feb 2023 and May 2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 14 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "14" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "14" - }, "intent_template_id": 321, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Eric make between Feb 2023 and May 2023?", + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": {"user": "Eric", "period": "between Feb 2023 and May 2023"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + 
"results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [14] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 305, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": { - "user": "Philip", - "period": "in 2023/1" - }, - "intent": "How many commits did Philip make in 2023/1?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 321, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Philip make in 2023/1?", + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": {"user": "Philip", "period": "in 2023/1"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": 
"SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 306, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": { - "user": "Anthony", - "period": "between 08/2022-09/2022" - }, - "intent": "How many commits did Anthony make between 08/2022-09/2022?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 321, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Anthony make between 08/2022-09/2022?", + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": {"user": "Anthony", "period": "between 08/2022-09/2022"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - 
"sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 307, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": { - "user": "Nic", - "period": "in April 2021" - }, - "intent": "How many commits did Nic make in April 2021?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "16" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "16" - }, "intent_template_id": 321, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Expect the right error code for unachievable tasks" - }, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "How many commits did Nic make in April 2021?", + "intent_template": "How many commits did {{user}} make {{period}}?", + "instantiation_dict": {"user": "Nic", "period": "in April 2021"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [16] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 308, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": 
"__GITLAB__", - "geolocation": null, - "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", - "instantiation_dict": { - "repo": "primer/design" - }, - "intent": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Shawn Allen" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Shawn Allen" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Shawn Allen" - }, "intent_template_id": 323, - "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get the username(s) of the user(s) with the most commits to the primer/design project", + "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", + "instantiation_dict": {"repo": "primer/design"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["shawn.allen@github.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 309, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": 
"Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project", - "instantiation_dict": { - "repo": "thoughtbot/administrate" - }, - "intent": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Grayson Wright" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Grayson Wright" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Grayson Wright" - }, "intent_template_id": 323, - "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get the username(s) of the user(s) with the most commits to the thoughtbot/administrate project", + "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", + "instantiation_dict": {"repo": "thoughtbot/administrate"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["wright.grayson@gmail.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 310, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me 
who has made the most contributions, in terms of number of commits, to the {{repo}} project", - "instantiation_dict": { - "repo": "AndroidSlidingUpPanel" - }, - "intent": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "tokudu" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "tokudu" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "tokudu" - }, "intent_template_id": 323, - "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get the username(s) of the user(s) with the most commits to the AndroidSlidingUpPanel project", + "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", + "instantiation_dict": {"repo": "AndroidSlidingUpPanel"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["antonlopyrev@gmail.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 311, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me who has made the most contributions, in 
terms of number of commits, to the {{repo}} project", - "instantiation_dict": { - "repo": "Pytorch GAN" - }, - "intent": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Erik Linder-Nor\u00e9n" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Erik Linder-Nor\u00e9n" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Erik Linder-Nor\u00e9n" - }, "intent_template_id": 323, - "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get the username(s) of the user(s) with the most commits to the Pytorch GAN project", + "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", + "instantiation_dict": {"repo": "Pytorch GAN"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["eriklindernoren@live.se"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 312, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to 
the {{repo}} project", - "instantiation_dict": { - "repo": "csvkit" - }, - "intent": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Christopher Groskopf" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "Christopher Groskopf" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Christopher Groskopf" - }, "intent_template_id": 323, - "format_specification": "Return the first and last name as a string. If first and last name don't exist then return their username", - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Get the username(s) of the user(s) with the most commits to the csvkit project", + "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", + "instantiation_dict": {"repo": "csvkit"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["staringmonkey@gmail.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 313, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return the phone number to call for the customer service?", - "original.intent_template": "Which number to call for the customer 
service?", - "instantiation_dict": {}, - "intent": "Return the phone number to call for the customer service?", - "original.intent": "Which number to call for the customer service?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no phone number in the website", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 134, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to specify return value" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the customer service phone number", + "intent_template": "Get the customer service phone number", + "instantiation_dict": {}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 314, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": { - "repo": "primer/design", - "attribute": "name" - }, - "original.instantiation_dict": { - "repo": "prime/design", - "attribute": "name" - }, - "intent": "List the name of the top 3 contributors to primer/design 
repo, ranked by the number of commits?", - "original.intent": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "Shawn Allen", - "Inayaili Le\u00f3n", - "Aurora Pleguezuelo" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Shawn Allen", - "Inayaili Le\u00f3n", - "Aurora Pleguezuelo" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Shawn Allen, Inayaili Le\u00f3n, Aurora Pleguezuelo" - }, "intent_template_id": 324, - "format_specification": "Return the first and last names as a list", - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Corrected misspelling" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "List the full names of the top 3 contributors to primer/design repo, ranked by the number of commits?", + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": {"repo": "primer/design", "attribute": "full names"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Shawn Allen", "Inayaili Le\u00f3n", "Aurora Pleguezuelo"] + } } - ] + ], + "revision": 2 }, { - 
"sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 315, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 324, + "start_urls": ["__GITLAB__"], + "intent": "List the email addresses of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?", "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": { - "repo": "Pytorch GAN", - "attribute": "email address" - }, - "intent": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "type": "text" - }, - "expected_data": [ - "eriklindernoren@live.se", - "eriklindernoren@gmail.com", - "pinnacle.chen@qq.com" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "eriklindernoren@live.se", - "eriklindernoren@gmail.com", - "pinnacle.chen@qq.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "eriklindernoren@live.se, eriklindernoren@gmail.com, pinnacle.chen@qq.com" - }, - "intent_template_id": 324, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "instantiation_dict": {"repo": "Pytorch GAN", "attribute": "email addresses"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": 
"retrieve", + "status": "SUCCESS", + "retrieved_data": ["eriklindernoren@live.se", "eriklindernoren@gmail.com", "pinnacle.chen@qq.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 316, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": { - "repo": "facebook's guide on building react apps", - "attribute": "name" - }, - "intent": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Ian Sutherland", - "Joe Hadda", - "Dan Abramov" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Ian Sutherland, Joe Hadda, Dan Abramov" - }, "intent_template_id": 324, - "changelogs": [ - { - "key": "expected_data", - "category": "unachievable_tasks", - "note": "Repository not found" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "List the names of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?", + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": {"repo": "facebook's guide on building react apps", "attribute": "names"}, + "format_specification": null, + 
"start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 317, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 324, + "start_urls": ["__GITLAB__"], + "intent": "List the names and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?", "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": { - "repo": "metaseq", - "attribute": "name and number of commits" - }, - "intent": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" + "instantiation_dict": {"repo": "metaseq", "attribute": "names and number of commits"}, + "format_specification": "Use objects with keys: \"first_name\", \"last_name\" and \"number_of_commits\".", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "first_name" : { "type": "string" }, + "last_name" : { "type": "string" }, + "number_of_commits": { "type": "number" } }, - "expected_data": [ - "Susan Zhang: 70", - "Stephen Roller: 51", - "Peter Albert: 12" - ] + "required": ["first_name", "last_name", "number_of_commits"] } + }, + "expected": { + 
"performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "first_name": "Susan" , "last_name": "Zhang" , "number_of_commits": 70 }, + { "first_name": "Stephen", "last_name": "Roller", "number_of_commits": 51 }, + { "first_name": "Peter" , "last_name": "Albert", "number_of_commits": 12 } + ] } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Susan Zhang: 70", - "Stephen Roller: 51", - "Peter Albert: 12" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Susan Zhang: 70, Stephen Roller: 51, Peter Albert: 12" - }, - "intent_template_id": 324, - "format_specification": "Return a list where each element is a string with format, : ", - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 318, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": { - "repo": "2019-nCov", - "attribute": "last names" - }, - "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "Lo", - "Chen", - "Chu" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - 
"must_include": [ - "Lo", - "Chen", - "Chu" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lo, Chen, Chu" - }, "intent_template_id": 324, - "format_specification": "Return a list their first names", - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__"], + "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", + "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "instantiation_dict": {"repo": "2019-nCov", "attribute": "last names"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Lo", "Chen", "Chu"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 319, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", - "instantiation_dict": { - "time": "April 2022" - }, - "intent": "How much refund I should expect from my order canceled in April 2022, including shipping fee", - "original.intent": "How much refund I should expect from my order canlled in April 2022, including shipping fee", - "require_reset": false, - "eval": { - 
"expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 160, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, + "start_urls": ["__SHOPPING__"], + "intent": "How much refund I should expect from my order canceled in April 2022, including shipping fee", + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "instantiation_dict": {"time": "April 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 320, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", - "instantiation_dict": { - "time": "Feb 2023" - }, - "intent": "How much refund I should expect from my order canceled in Feb 2023, including shipping fee", - "original.intent": "How much refund I 
should expect from my order canlled in Feb 2023, including shipping fee", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "406.53" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "406.53" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "406.53" - }, "intent_template_id": 160, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, + "start_urls": ["__SHOPPING__"], + "intent": "How much refund I should expect from my order canceled in Feb 2023, including shipping fee", + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "instantiation_dict": {"time": "Feb 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["406.53"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 321, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "original.intent_template": "How much refund I should expect from my order canlled in {{time}}, including shipping fee", - "instantiation_dict": { - "time": "2022" - }, - "intent": "How 
much refund I should expect from my order canceled in 2022, including shipping fee", - "original.intent": "How much refund I should expect from my order canlled in 2022, including shipping fee", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "3053.97" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "3053.97" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "3053.97" - }, "intent_template_id": 160, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, + "start_urls": ["__SHOPPING__"], + "intent": "How much refund I should expect from my order canceled in 2022, including shipping fee", + "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", + "instantiation_dict": {"time": "2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["3053.97"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 322, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "How much refund I should expect from my order canceled in {{time}} if I cannot get the shipping fee refunded?", - "original.intent_template": "How much refund I 
should expect from my order canlled in {{time}} if I cannot get the shipping fee refunded?", - "instantiation_dict": { - "time": "May 2023" - }, - "intent": "How much refund I should expect from my order canceled in May 2023 if I cannot get the shipping fee refunded?", - "original.intent": "How much refund I should expect from my order canlled in May 2023 if I cannot get the shipping fee refunded?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "350.42" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "350.42" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "350.42" - }, "intent_template_id": 160, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, + "start_urls": ["__SHOPPING__"], + "intent": "How much refund I should expect from my order canceled in May 2023 if I cannot get the shipping fee refunded?", + "intent_template": "How much refund I should expect from my order canceled in {{time}} if I cannot get the shipping fee refunded?", + "instantiation_dict": {"time": "May 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["350.42"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - 
], + "sites": ["shopping"], "task_id": 323, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "How much refund I should expect from my order canceled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "original.intent_template": "How much refund I should expect from my order canlled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "instantiation_dict": { - "time": "2022/03" - }, - "intent": "How much refund I should expect from my order canceled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "original.intent": "How much refund I should expect from my order canlled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "264.49" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "264.49" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "264.49" - }, "intent_template_id": 160, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix spelling of canceled" - }, + "start_urls": ["__SHOPPING__"], + "intent": "How much refund I should expect from my order canceled in 2022/03? 
I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "intent_template": "How much refund I should expect from my order canceled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "instantiation_dict": {"time": "2022/03"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["264.49"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 324, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", - "instantiation_dict": { - "product": "chairs", - "sorting_order": "ascending price" - }, + "intent_template_id": 208, + "start_urls": ["__SHOPPING__"], "intent": "Show me the \"chairs\" listings by ascending price.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 208 + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": {"product": "chairs", "sorting_order": "ascending price"}, + "format_specification": null, + "start_url_context": null, + 
"eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 325, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", - "instantiation_dict": { - "product": "mouth night guard", - "sorting_order": "descending price" - }, + "intent_template_id": 208, + "start_urls": ["__SHOPPING__"], "intent": "Show me the \"mouth night guard\" listings by descending price.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 208 + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": {"product": "mouth night guard", "sorting_order": "descending price"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": 
"NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 326, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 208, + "start_urls": ["__SHOPPING__"], + "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.", "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", "instantiation_dict": { "product": "Canon photo printer", "sorting_order": "search relevance, from most to least" }, - "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 208 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": 
["shopping"], "task_id": 327, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", - "instantiation_dict": { - "product": "iphone 12 phone case", - "sorting_order": "name alphabetically" - }, + "intent_template_id": 208, + "start_urls": ["__SHOPPING__"], "intent": "Show me the \"iphone 12 phone case\" listings by name alphabetically.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 208 + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "name alphabetically"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 328, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": 
"Show me the \"{{product}}\" listings by {{sorting_order}}.", - "instantiation_dict": { - "product": "iphone 12 phone case", - "sorting_order": "price" - }, + "intent_template_id": 208, + "start_urls": ["__SHOPPING__"], "intent": "Show me the \"iphone 12 phone case\" listings by price.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 208 + "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "price"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 329, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", - "instantiation_dict": { - "time": "on 4/19/2023" - }, - 
"intent": "Return how much I spent on 4/19/2023 on shopping at One Stop Market?", - "original.intent": "How much I spend on 4/19/2023 on shopping at One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 147, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on 4/19/2023 on shopping at One Stop Market?", + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "instantiation_dict": {"time": "on 4/19/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 330, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", - "instantiation_dict": { - "time": "in March 2023" - }, - "intent": "Return how much I spent in March 2023 
on shopping at One Stop Market?", - "original.intent": "How much I spend in March 2023 on shopping at One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "83.31" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "81.31" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "81.31" - }, "intent_template_id": 147, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "The two orders add up to 83.31" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent in March 2023 on shopping at One Stop Market?", + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "instantiation_dict": {"time": "in March 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["83.31"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 331, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", - 
"instantiation_dict": { - "time": "in July 2022" - }, + "intent_template_id": 147, + "start_urls": ["__SHOPPING__"], "intent": "Return how much I spent in July 2022 on shopping at One Stop Market?", - "original.intent": "How much I spend in July 2022 on shopping at One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "40.16" - ] - } + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "instantiation_dict": {"time": "in July 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["40.16"] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "40.16" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "40.16" - }, - "intent_template_id": 147, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 332, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "original.intent_template": "How much I spend {{time}} on shopping at One Stop Market?", - "instantiation_dict": { - "time": "each month from Jan to the end of March 2023" - 
}, + "intent_template_id": 147, + "start_urls": ["__SHOPPING__"], "intent": "Return how much I spent each month from Jan to the end of March 2023 on shopping at One Stop Market?", - "original.intent": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "instantiation_dict": {"time": "each month from Jan to the end of March 2023"}, + "format_specification": "Use \"month\" for month and \"total\" for spent amount.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "month": { "type": "string", "format": "month" }, + "total": { "type": "number", "format": "currency" } }, - "expected_data": [ - { - "month": { - "value": "Jan", - "type": "month" - }, - "total": { - "value": "572.88", - "type": "currency" - } - }, - { - "month": { - "value": "Feb", - "type": "month" - }, - "total": { - "value": "947.5", - "type": "currency" - } - }, - { - "month": { - "value": "Mar", - "type": "month" - }, - "total": { - "value": "83.31", - "type": "currency" - } - } - ] + "required": ["month", "total"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "month": "Jan", "total": 572.88 }, + { "month": "Feb", "total": 947.50 }, + { "month": "Mar", "total": 83.31 } + ] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "Jan: 572.8", - "Feb: 762.18", - "Mar: 83.31" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": 
"Jan: 572.8\nFeb: 762.18\nMar: 83.31" - }, - "intent_template_id": 147, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } ], - "format_specification": "Use \"month\" for month and \"total\" for spent amount." + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 333, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "original.intent_template": "How much did I spend on shopping at One Stop Market {{time}}? They gave me a 20% discount on the total amount for orders exceeding $200 in cash", - "instantiation_dict": { - "time": "on November 2022" - }, - "intent": "Return how much I spent on November 2022 on shopping at One Stop Market?", - "original.intent": "How much did I spend on shopping at One Stop Market on November 2022? 
They gave me a 20% discount on the total amount for orders exceeding $200 in cash", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "currency" - }, - "expected_data": [ - "403.18" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "359.546" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "359.546" - }, "intent_template_id": 147, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "The three orders add up to 403.18" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Return how much I spent on November 2022 on shopping at One Stop Market?", + "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "instantiation_dict": {"time": "on November 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["403.18"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 334, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": { - "description": "muffin cornbread mix" - }, - "intent": "Tell me when I last ordered my muffin cornbread 
mix?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "March 11th 2023" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "March 11th 2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "March 11th 2023" - }, "intent_template_id": 169, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Tell me when I last ordered my muffin cornbread mix?", + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": {"description": "muffin cornbread mix"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["March 11th 2023"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 335, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": { - "description": "body butter" - }, - "intent": "Tell me when I last ordered my body butter?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "January 16th 2023" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - 
"eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "January 16th 2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "January 16th 2023" - }, "intent_template_id": 169, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Tell me when I last ordered my body butter?", + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": {"description": "body butter"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["January 16th 2023"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 336, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": { - "description": "conditioner" - }, - "intent": "Tell me when I last ordered my conditioner?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "January 16th 2023" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "January 16th 2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "January 16th 2023" - }, "intent_template_id": 169, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": 
"Tell me when I last ordered my conditioner?", + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": {"description": "conditioner"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["January 16th 2023"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 337, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": { - "description": "bread olive" - }, - "intent": "Tell me when I last ordered my bread olive?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "December 12th 2022" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "December 12th 2022" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "December 12th 2022" - }, "intent_template_id": 169, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Tell me when I last ordered my bread olive?", + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": {"description": "bread olive"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from 
fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["December 12th 2022"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 338, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": { - "description": "toothpaste" - }, - "intent": "Tell me when I last ordered my toothpaste?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "December 4th 2022" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "December 4th 2022" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "December 4th 2022" - }, "intent_template_id": 169, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Tell me when I last ordered my toothpaste?", + "intent_template": "Tell me when I last ordered my {{description}}?", + "instantiation_dict": {"description": "toothpaste"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["December 4th 2022"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": 
["gitlab"], "task_id": 339, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, - "intent_template": "Navigate to and display a list of all opened issues {{description}}", - "original.intent_template": "List all opened issues {{description}}", - "instantiation_dict": { - "description": "that report bugs" - }, - "intent": "Navigate to and display a list of all opened issues that report bugs", - "original.intent": "List all opened issues that report bugs", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 299, - "changelogs": [ + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "Navigate to the list of all opened issues that report bugs", + "intent_template": "Navigate to the list of all opened issues {{description}}", + "instantiation_dict": {"description": "that report bugs"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug&state=opened", + "response_status": 200, + "event_type": 
"navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 340, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, - "intent_template": "Navigate to and display a list of all opened issues {{description}}", - "original.intent_template": "List all opened issues {{description}}", - "instantiation_dict": { - "description": "that report bugs" - }, - "intent": "Navigate to and display a list of all opened issues that report bugs", - "original.intent": "List all opened issues that report bugs", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 299, - "changelogs": [ + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Navigate to the list of all opened issues that report bugs", + "intent_template": "Navigate to the list of all opened issues {{description}}", + "instantiation_dict": {"description": "that report bugs"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": 
"__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 341, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/root/metaseq", - "geolocation": null, - "intent_template": "Navigate to and display a list of all opened issues {{description}}", - "original.intent_template": "List all opened issues {{description}}", - "instantiation_dict": { - "description": "requesting new features" - }, - "intent": "Navigate to and display a list of all opened issues requesting new features", - "original.intent": "List all opened issues requesting new features", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 299, - "changelogs": [ + "start_urls": ["__GITLAB__/root/metaseq"], + "intent": "Navigate to the list of all opened issues requesting new features", + "intent_template": "Navigate to the list of all opened issues {{description}}", + "instantiation_dict": {"description": "requesting new features"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": 
"gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 342, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/root/metaseq", - "geolocation": null, - "intent_template": "Navigate to and display a list of all opened issues {{description}}", - "original.intent_template": "List all opened issues {{description}}", - "instantiation_dict": { - "description": "that ask about OPT model related questions" - }, - "intent": "Navigate to and display a list of all opened issues that ask about OPT model related questions", - "original.intent": "List all opened issues that ask about OPT model related questions", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 299, - "changelogs": [ + "start_urls": ["__GITLAB__/root/metaseq"], + "intent": "Navigate to the list of all opened issues that ask about OPT model related questions", + "intent_template": "Navigate to the list of all opened issues {{description}}", + "instantiation_dict": {"description": "that ask about OPT model related questions"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "AgentResponseEvaluator", + 
"results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 343, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/root/metaseq", - "geolocation": null, - "intent_template": "Navigate to and display a list of all opened issues {{description}}", - "original.intent_template": "List all opened issues {{description}}", - "instantiation_dict": { - "description": "that don't have any labels" - }, - "intent": "Navigate to and display a list of all opened issues that don't have any labels", - "original.intent": "List all opened issues that don't have any labels", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 299, - "changelogs": [ + "start_urls": ["__GITLAB__/root/metaseq"], + "intent": "Navigate to the list of all opened issues that don't have any labels", + "intent_template": "Navigate to the list of all opened issues {{description}}", + "instantiation_dict": {"description": "that don't have any labels"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, 
+ "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None&state=opened", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 344, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "How many reviews did our shop receive {{time}}?", - "original.intent_template": "How many reviews our shop received {{time}}?", - "instantiation_dict": { - "time": "so far" - }, - "original.instantiation_dict": { - "time": "by far" - }, - "intent": "How many reviews did our shop receive so far?", - "original.intent": "How many reviews our shop received by far?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 351 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "351" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "351" - }, "intent_template_id": 248, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "How many reviews did our shop receive so far?", + "intent_template": "How many reviews did our shop receive {{time}}?", + "instantiation_dict": {"time": "so far"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [351] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 345, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "How many reviews did our shop receive {{time}}?", - "original.intent_template": "How many reviews our shop received {{time}}?", - "instantiation_dict": { - "time": "in Apr 2023" - }, - "intent": "How many reviews did our shop receive in Apr 2023?", - "original.intent": "How many reviews our shop received in Apr 2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 351 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "351" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "351" - }, "intent_template_id": 248, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "How many reviews did our shop receive in Apr 2023?", + "intent_template": "How many reviews did our shop receive {{time}}?", + "instantiation_dict": {"time": "in Apr 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [351] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 346, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "How many reviews did our shop receive {{time}}?", - "original.intent_template": "How many reviews our shop received {{time}}?", - "instantiation_dict": { - "time": "during 2022" - }, - "intent": "How many reviews did our shop receive during 2022?", - "original.intent": "How many reviews our shop received during 2022?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 248, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "How many reviews did our shop receive during 2022?", + "intent_template": "How many reviews did our shop receive {{time}}?", + "instantiation_dict": {"time": "during 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 347, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "How many reviews did our shop receive {{time}}?", - "original.intent_template": "How many reviews our shop received {{time}}?", - "instantiation_dict": { - "time": "from the beginning of the shop" - }, - "intent": "How many reviews did our shop receive from the beginning of the shop?", - "original.intent": "How many reviews our shop received from the beginning of the shop?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 351 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "351" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "351" - }, "intent_template_id": 248, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "How many reviews did our shop receive from the beginning of the shop?", + "intent_template": "How many reviews did our shop receive {{time}}?", + "instantiation_dict": {"time": "from the beginning of the shop"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [351] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 348, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "How many reviews did our shop receive {{time}}?", - "original.intent_template": "How many reviews our shop received {{time}}?", - "instantiation_dict": { - "time": "in May 2023" - }, - "intent": "How many reviews did our shop receive in May 2023?", - "original.intent": "How many reviews our shop received in May 2023?", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 248, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording." 
- }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "How many reviews did our shop receive in May 2023?", + "intent_template": "How many reviews did our shop receive {{time}}?", + "instantiation_dict": {"time": "in May 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 349, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Who else has access to my repo {{repo}}, return me their usernames", - "original.intent_template": "Who else have access to my repo {{repo}}, show me their usernames", - "instantiation_dict": { - "repo": "gimmiethat.space" - }, - "intent": "Who else has access to my repo gimmiethat.space, return me their usernames", - "original.intent": "Who else have access to my repo gimmiethat.space, show me their usernames", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "yjlou" - ] - } + "intent_template_id": 298, + "start_urls": ["__GITLAB__"], + "intent": "Get the usernames of other users who have access to my repo gimmiethat.space", + "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", + "instantiation_dict": {"repo": "gimmiethat.space"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { 
"type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["yjlou"] } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "yjlou" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "yjlou" - }, - "intent_template_id": 298, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a retrieve task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 350, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Who else has access to my repo {{repo}}, return me their usernames", - "original.intent_template": "Who else have access to my repo {{repo}}, show me their usernames", - "instantiation_dict": { - "repo": "prism-theme" - }, - "intent": "Who else has access to my repo prism-theme, return me their usernames", - "original.intent": "Who else have access to my repo prism-theme, show me their usernames", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "abisubramanya27" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "abisubramanya27" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Abishek S, abisubramanya27" - }, "intent_template_id": 298, - "changelogs": [ + "start_urls": 
["__GITLAB__"], + "intent": "Get the usernames of other users who have access to my repo prism-theme", + "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", + "instantiation_dict": {"repo": "prism-theme"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a retrieve task" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["abisubramanya27"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 351, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": { - "product_category": "PS4 accessories", - "order": "ascending" - }, + "intent_template_id": 137, + "start_urls": ["__SHOPPING__"], "intent": "List products from PS4 accessories category by ascending price", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 137 + "intent_template": "List products 
from {{product_category}} category by {{order}} price", + "instantiation_dict": {"product_category": "PS4 accessories", "order": "ascending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 352, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": { - "product_category": "nutrition bars and drinks", - "order": "ascending" - }, + "intent_template_id": 137, + "start_urls": ["__SHOPPING__"], "intent": "List products from nutrition bars and drinks category by ascending price", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 137 + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": {"product_category": "nutrition bars and drinks", "order": "ascending"}, + "format_specification": 
null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 353, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": { - "product_category": "competitive swimwear", - "order": "ascending" - }, + "intent_template_id": 137, + "start_urls": ["__SHOPPING__"], "intent": "List products from competitive swimwear category by ascending price", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 137 + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": {"product_category": "competitive swimwear", "order": "ascending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 354, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": { - "product_category": "living room furtniture", - "order": "descending" - }, + "intent_template_id": 137, + "start_urls": ["__SHOPPING__"], "intent": "List products from living room furtniture category by descending price", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 137 + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": {"product_category": "living room furtniture", "order": "descending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + 
"site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 355, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": { - "product_category": "kids' bedding", - "order": "descending" - }, + "intent_template_id": 137, + "start_urls": ["__SHOPPING__"], "intent": "List products from kids' bedding category by descending price", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc" - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 137 + "intent_template": "List products from {{product_category}} category by {{order}} price", + "instantiation_dict": {"product_category": "kids' bedding", "order": "descending"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", + "response_status": 200, + "event_type": "navigation" + } 
+ } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 356, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "intent_template_id": 49, + "start_urls": ["__MAP__"], + "intent": "Show the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "intent_template": "Show the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", "instantiation_dict": {}, - "intent": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Gates and Hillman Centers", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Independence Hall", - "Philadelphia" - ] - } - } - ] - }, - "intent_template_id": 49 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + 
"url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 357, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 291, + "start_urls": ["__GITLAB__"], + "intent": "Navigate to the merge requests requiring my review", "intent_template": "Navigate to the merge requests requiring my review", - "original.intent_template": "Checkout merge requests requiring my review", "instantiation_dict": {}, - "intent": "Navigate to the merge requests requiring my review", - "original.intent": "Checkout merge requests requiring my review", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze" - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 291, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Clarify the intent as a navigation task" + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": [ + "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", + "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze&scope=all&state=opened" + ], + "response_status": 200, + "event_type": "navigation" + } 
} - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 358, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", - "instantiation_dict": { - "info": "shipping method", - "order_number": 187 - }, - "intent": "Get the shipping method for order number 187.", - "original.intent": "Show me the shipping method for order number 187.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Flat Rate - Fixed" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Flat Rate - Fixed" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Flat Rate - Fixed" - }, "intent_template_id": 206, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify return value expected" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the shipping method for order number 187.", + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "instantiation_dict": {"info": "shipping method", "order_number": 187}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Flat Rate - Fixed"] + } } - 
] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 359, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", - "instantiation_dict": { - "info": "order date", - "order_number": "148" - }, - "intent": "Get the order date for order number 148.", - "original.intent": "Show me the order date for order number 148.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "date" - }, - "expected_data": [ - "1/29/2023" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "1/29/2023" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1/29/2023" - }, "intent_template_id": 206, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify return value expected" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the order date for order number 148.", + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "instantiation_dict": {"info": "order date", "order_number": "148"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["1/29/2023"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - 
], + "sites": ["shopping"], "task_id": 360, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", - "instantiation_dict": { - "info": "product names", - "order_number": "148" - }, - "intent": "Get the product names for order number 148.", - "original.intent": "Show me the product names for order number 148.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", - "Russound 5B45W 4\" Indoor Outdoor Speakers White" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", - "Russound 5B45W 4\" Indoor Outdoor Speakers White" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress), Russound 5B45W 4\" Indoor Outdoor Speakers White" - }, "intent_template_id": 206, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify return value expected" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Get the product names for order number 148.", + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "instantiation_dict": {"info": "product names", "order_number": "148"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", + "Russound 5B45W 4\" Indoor Outdoor Speakers White" + ] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 361, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", - "instantiation_dict": { - "info": "order statuses", - "order_number": "170 and 189" - }, + "intent_template_id": 206, + "start_urls": ["__SHOPPING__"], "intent": "Get the order statuses for order number 170 and 189.", - "original.intent": "Show me the order statuses for order number 170 and 189.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "order_number": { - "value": "170", - "type": "numeric" - }, - "status": { - "value": "canceled", - "type": "text" - } - }, - { - "order_number": { - "value": "189", - "type": "numeric" - }, - "status": { - "value": "pending", - "type": "text" - } - } - ] + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "instantiation_dict": {"info": "order statuses", "order_number": "170 and 189"}, + "format_specification": "Use \"order_number\" for order number and 
\"status\" for status.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "order_number": {"type": "string"}, "status": {"type": "string"} }, + "required": ["order_number", "status"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "order_number": "170", "status": "canceled" }, + { "order_number": "189", "status": "pending" } + ] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": [ - "170: cancelled", - "189: pending" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "170: cancelled, 189: pending" - }, - "intent_template_id": 206, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify return value expected" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched from fuzzy eval" } ], - "format_specification": "Use \"order_number\" for order number and \"status\" for status." 
+ "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 362, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "original.intent_template": "Show me the {{info}} for order number {{order_number}}.", - "instantiation_dict": { - "info": "billing address", - "order_number": "00178" - }, + "intent_template_id": 206, + "start_urls": ["__SHOPPING__"], "intent": "Get the billing address for order number 00178.", - "original.intent": "Show me the billing address for order number 00178.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "object" - }, - "expected_data": [ - { - "address": { - "value": "101 S San Mateo Dr", - "type": "text" - }, - "city": { - "value": "San Mateo", - "type": "text" - }, - "state": { - "value": "California", - "type": "text" - }, - "zip_code": { - "value": "94010", - "type": "text" - }, - "country": { - "value": "United States", - "type": "text" - } - } - ] - } + "intent_template": "Get the {{info}} for order number {{order_number}}.", + "instantiation_dict": {"info": "billing address", "order_number": "00178"}, + "format_specification": "Use keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\". 
Set any key to null if not available.", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "title": "full_address", + "properties": { + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" }, + "country" : { "type": "string" } + }, + "required": ["house_number", "street", "city", "country", "state", "postcode"] + } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "house_number": "101", + "street": "S San Mateo Dr", + "city": "San Mateo", + "state": "California", + "postcode": "94010", + "country": "United States" + } + ] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "101 S San Mateo Dr", - "San Mateo", - "California", - "94010", - "United States" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Emma Lopez, 101 S San Mateo Dr, San Mateo, California, 94010, United States" - }, - "intent_template_id": 206, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify return value expected" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } ], - "format_specification": "Use keys \"address\", \"city\", \"state\", \"zip_code\", and \"country\". Set any key to null if not available." 
+ "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 363, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 58, + "start_urls": ["__MAP__"], + "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking", "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", "instantiation_dict": { "location/address_1": "Carnegie Mellon University", "location/address_2": "Carnegie Music Hall" }, - "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "748m" - ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["748m"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "748m" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "748m" - }, - "intent_template_id": 58 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 364, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 58, + "start_urls": ["__MAP__"], + "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking", "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", 
"instantiation_dict": { "location/address_1": "Carnegie Mellon University", "location/address_2": "UPMC Shadyside" }, - "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "1.7km" - ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["1.7km"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "1.7km" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1.7km" - }, - "intent_template_id": 58 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 365, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 58, + "start_urls": ["__MAP__"], + "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking", "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", "instantiation_dict": { "location/address_1": "Carnegie Music Hall", "location/address_2": "UPMC Shadyside" }, - "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "2.2km" - ] - } + "format_specification": null, + 
"start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["2.2km"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "2.2km" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "2.2km" - }, - "intent_template_id": 58 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 366, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 58, + "start_urls": ["__MAP__"], + "intent": "Measure distance between CVS (closet one) and UPMC Shadyside by walking", "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", "instantiation_dict": { "location/address_1": "CVS (closet one)", "location/address_2": "UPMC Shadyside" }, - "intent": "Measure distance between CVS (closet one) and UPMC Shadyside by walking", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "1.2km" - ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["1.2km"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "1.2km" - }, - "reference_url": "", - "program_html": [], - 
"string_note": "", - "reference_answer_raw_annotation": "1.2km" - }, - "intent_template_id": 58 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 367, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 58, + "start_urls": ["__MAP__"], + "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", "instantiation_dict": { "location/address_1": "Carnegie Mellon University", "location/address_2": "CVS (closet one)" }, - "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "1.4km" - ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": true, + "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["1.4km"] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "exact_match": "1.4km" - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "1.4km" - }, - "intent_template_id": 58 + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 368, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "find discounted items.", - "instantiation_dict": {}, - "intent": "find discounted items.", - "require_reset": false, 
- "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no function to show only discount items", - "reference_answer_raw_annotation": "There is no function to show only discount items." - }, "intent_template_id": 188, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "find discounted items.", + "intent_template": "find discounted items.", + "instantiation_dict": {}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 369, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": { - "location": "Carnegie Music Hall" - }, + "intent_template_id": 52, + "start_urls": ["__MAP__"], "intent": "Pull up the description page of Carnegie Music Hall on Map", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Carnegie Music Hall" - ] - } + "intent_template": "Pull up the description page of 
{{location}} on Map", + "instantiation_dict": {"location": "Carnegie Music Hall"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/way/154257484/", + "response_status": 200, + "event_type": "navigation" } - ] - }, - "intent_template_id": 52 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 370, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": { - "location": "Carnegie Mellon University" - }, + "intent_template_id": 52, + "start_urls": ["__MAP__"], "intent": "Pull up the description page of Carnegie Mellon University on Map", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Carnegie Mellon University" - ] - } + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": {"location": "Carnegie Mellon University"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": 
"__MAP__/relation/2279034/", + "response_status": 200, + "event_type": "navigation" } - ] - }, - "intent_template_id": 52 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 371, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": { - "location": "Piada restaurant near Pitt" - }, + "intent_template_id": 52, + "start_urls": ["__MAP__"], "intent": "Pull up the description page of Piada restaurant near Pitt on Map", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Piada Italian Street Food", - "Forbes Avenue" - ] - } + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": {"location": "Piada restaurant near Pitt"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/node/2710170970", + "response_status": 200, + "event_type": "navigation" } - ] - }, - "intent_template_id": 52 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 372, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 52, + "start_urls": ["__MAP__"], + "intent": "Pull up the description page of the Costco in Pittsburgh near a river on Map", 
"intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": { - "location": "the Costco in Pittsburhg near a river" - }, - "intent": "Pull up the description page of the Costco in Pittsburhg near a river on Map", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Costco", - "Waterfront Drive West" - ] - } + "instantiation_dict": {"location": "the Costco in Pittsburgh near a river"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/way/168456128", + "response_status": 200, + "event_type": "navigation" } - ] - }, - "intent_template_id": 52 + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 373, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": { - "location": "Whole Foods near Carnegie Mellon" - }, + "intent_template_id": 52, + "start_urls": ["__MAP__"], "intent": "Pull up the description page of Whole Foods near Carnegie Mellon on Map", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": null, - "program_html": [ - { - "url": "last", - "locator": 
"document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Whole Foods", - "East Liberty" - ] - } + "intent_template": "Pull up the description page of {{location}} on Map", + "instantiation_dict": {"location": "Whole Foods near Carnegie Mellon"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/node/10114377662", + "response_status": 200, + "event_type": "navigation" } - ] - }, - "intent_template_id": 52 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 374, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show the {{name}} theme settings", - "original.intent_template": "Preview the {{name}} theme for my shop", - "instantiation_dict": { - "name": "Magento Blank" - }, - "intent": "Show the Magento Blank theme settings", - "original.intent": "Preview the Magento Blank theme for my shop", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1" - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 266, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Show the Magento Blank theme settings", + "intent_template": "Show the {{name}} theme settings", + "instantiation_dict": 
{"name": "Magento Blank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Preview a theme can mean applying the theme. We match the intent to the original evaluation target which is navigating to the theme settings." + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 375, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Show the {{name}} theme settings", - "original.intent_template": "Preview the {{name}} theme for my shop", - "instantiation_dict": { - "name": "Magento Luma" - }, - "intent": "Show the Magento Luma theme settings", - "original.intent": "Preview the Magento Luma theme for my shop", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/" - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/", - "program_html": [], - "url_note": "GOLD in PRED" - }, "intent_template_id": 266, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Show the Magento Luma theme settings", + "intent_template": "Show the {{name}} theme settings", + "instantiation_dict": {"name": "Magento Luma"}, + "format_specification": null, + 
"start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Preview a theme can mean applying the theme. We match the intent to the original evaluation target which is navigating to the theme settings." + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/", + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 376, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Summarize customer reviews for {{product}}.", - "instantiation_dict": { - "product": "Amazon Echo Dot 3rd generation" - }, - "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "NOT_FOUND_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no review for this product", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 182, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", + "intent_template": "Summarize customer reviews for {{product}}.", + "instantiation_dict": {"product": "Amazon Echo Dot 3rd generation"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - 
"key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 377, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the {{space}} around {{location}}", - "instantiation_dict": { - "location": "CMU ArtPark Lab", - "space": "resturants" - }, - "intent": "Find the resturants around CMU ArtPark Lab", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab" - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 59 + "intent_template_id": 59, + "start_urls": ["__MAP__"], + "intent": "Search for \"resturants near CMU ArtPark Lab\"", + "intent_template": "Search for \"{{space}} near {{location}}\"", + "instantiation_dict": {"location": "CMU ArtPark Lab", "space": "resturants"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": 
["map"], "task_id": 378, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the {{space}} around {{location}}", - "instantiation_dict": { - "location": "CMU main campus", - "space": "parking" - }, - "intent": "Find the parking around CMU main campus", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university" - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 59 + "intent_template_id": 59, + "start_urls": ["__MAP__"], + "intent": "Search for \"parking near Carnegie Mellon University\"", + "intent_template": "Search for \"{{space}} near {{location}}\"", + "instantiation_dict": {"location": "Carnegie Mellon University", "space": "parking"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/search?query=parking%20near%20Carnegie%20Mellon%20University", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 379, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the {{space}} around {{location}}", - "instantiation_dict": { - "location": "CMU main campus", - "space": "hotel" - }, - "intent": "Find the hotel around CMU main campus", - "require_reset": false, - 
"eval": { - "expected_ui_state": [ - { - "url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university" - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 59 + "intent_template_id": 59, + "start_urls": ["__MAP__"], + "intent": "Search for \"hotels near Carnegie Mellon University\"", + "intent_template": "Search for \"{{space}} near {{location}}\"", + "instantiation_dict": {"location": "Carnegie Mellon University", "space": "hotels"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Mellon%20University", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 380, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the {{space}} around {{location}}", - "instantiation_dict": { - "location": "Carnegie Music Hall", - "space": "bar" - }, - "intent": "Find the bar around Carnegie Music Hall", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall" - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", - "program_html": [], - "url_note": "GOLD in PRED" - 
}, - "intent_template_id": 59 + "intent_template_id": 59, + "start_urls": ["__MAP__"], + "intent": "Search for \"bars near Carnegie Music Hall\"", + "intent_template": "Search for \"{{space}} near {{location}}\"", + "instantiation_dict": {"location": "Carnegie Music Hall", "space": "bars"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 381, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the {{space}} around {{location}}", - "instantiation_dict": { - "location": "Carnegie Music Hall", - "space": "hotel" - }, - "intent": "Find the hotel around Carnegie Music Hall", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall" - } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "url_match" - ], - "reference_answers": null, - "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", - "program_html": [], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 59 + "intent_template_id": 59, + "start_urls": ["__MAP__"], + "intent": "Search for \"hotels near Carnegie Music Hall\"", + "intent_template": "Search for \"{{space}} near {{location}}\"", + "instantiation_dict": {"location": "Carnegie Music Hall", "space": "hotels"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", + "response_status": 200, + "event_type": "navigation" + } + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 382, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 781, + "start_urls": ["__MAP__"], + "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", "instantiation_dict": {}, - "intent": "I am arriving at Carnegie Mellon University. 
Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "There is no USCIS nearby", - "reference_answer_raw_annotation": "N/A" - }, - "intent_template_id": 781, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 383, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 782, + "start_urls": ["__MAP__"], + "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", "instantiation_dict": {}, - "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. 
Tell me the names of supermarkets that are within 15mins driving from the hotel", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" + "format_specification": "Use \"hotel\" for the hotel name and \"supermarkets\" for the list of supermarket names", + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "hotel" : { "type": "string" }, + "supermarkets": { "type": "array" , "items": {"type": "string"} } }, - "expected_data": [ - "Hyatt Regency Pittsburgh International Airport", - "Giant Eagle", - "ALDI" - ] + "required": ["hotel", "supermarkets"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "hotel": "Hyatt Regency Pittsburgh International Airport", + "supermarkets": ["Giant Eagle", "ALDI"] + } + ] } - ], - "site": "map" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Hyatt Regency Pittsburgh International Airport", - "Giant Eagle", - "ALDI" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Giant Eagle, ALDI" - }, - "intent_template_id": 782, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 384, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 666, + "start_urls": ["__SHOPPING__"], + "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", 
"intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", "instantiation_dict": {}, - "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Lisa Lee", - "Evelyn Kurver", - "Amanda", - "N Randall" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Lisa Lee", - "Evelyn Kurver", - "Amanda", - "N Randall" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lisa Lee, Evelyn Kurver, Amanda, N Randall" - }, - "intent_template_id": 666, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Lisa Lee", "Evelyn Kurver", "Amanda", "N Randall"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 385, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 666, + "start_urls": ["__SHOPPING__"], + "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", "intent_template": "List the customer names who thinks EYZUTAK phone cases are of good looking", "instantiation_dict": {}, - "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", - "require_reset": false, - 
"eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Lisa Lee", - "MH", - "Misba009", - "Amanda", - "N Randall", - "Amazon Customer", - "Cally", - "Bethany Robertson" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Lisa Lee", - "MH", - "Misba009", - "Amanda", - "N Randall", - "Amazon Customer", - "Cally", - "Bethany Robertson" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, Bethany Robertson" - }, - "intent_template_id": 666, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Lisa Lee", "MH", "Misba009", "Amanda", "Cally", "Bethany Robertson"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 386, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "What is the rating of {{product}}", - "instantiation_dict": { - "product": "Ugreen lightning to 3.5mm cable" - }, - "intent": "What is the rating of Ugreen lightning to 3.5mm cable", - "original.intent": "What is the rating of Ugreen lightning to 3.5mm cable. 
Please round to the nearest whole number", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "numeric" - }, - "expected_data": [ - 3 - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "65 |OR| 3" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Around 3.1 stars" - }, "intent_template_id": 1355, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "What is the rating of Ugreen lightning to 3.5mm cable", + "intent_template": "What is the rating of {{product}}", + "instantiation_dict": {"product": "Ugreen lightning to 3.5mm cable"}, + "format_specification": "Return a value between 0 and 100", + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [65] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 387, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", - "instantiation_dict": { - "stars": "4 or 5 stars" - }, + "intent_template_id": 1356, + "start_urls": ["__SHOPPING__"], "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "MH", 
- "Misba009", - "Amanda", - "Amazon Customer", - "Cally", - "Bethany Robertson" - ] - } + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": {"stars": "4 or 5 stars"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + "MH", "Misba009", "Amanda", "N Randall", "Amazon Customer", "Cally", + "Bethany Robertson" + ] } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "MH", - "Misba009", - "Amanda", - "Amazon Customer", - "Cally", - "Bethany Robertson" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "MH, Misba009, Amanda, Amazon Customer, Cally, Bethany Robertson, Bethany Robertson" - }, - "intent_template_id": 1356, - "changelogs": [ - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 388, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", - "instantiation_dict": { - "stars": "1 or 2 stars" - }, - "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": false, - "type": "text" - }, - "expected_data": [ - "Evelyn Kurver", - "N Randall" - ] - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - 
"must_include": [ - "Evelyn Kurver", - "N Randall" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Evelyn Kurver, N Randall" - }, "intent_template_id": 1356, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", + "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", + "instantiation_dict": {"stars": "1 or 2 stars"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": false, + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["Evelyn Kurver", "N Randall"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 389, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 348, + "start_urls": ["__GITLAB__"], + "intent": "Post \"Thanks, working on reviews\" for the merge request related to octovisuals page in primer/design project", "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": { "mr": "octovisuals page", "repo": "primer/design", "content": "Thanks, working on reviews" }, - "intent": "Post \"Thanks, working on reviews\" for the merge request related to octovisuals page in primer/design project", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "primer", - "project": "design", - "merge_request_iid": 450, - "comment": "Thanks, working on reviews" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": 
[ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/primer/design/-/merge_requests/450", - "locator": "", - "required_contents": { - "must_include": [ - "Thanks, working on reviews" - ] - } - } - ] - }, - "intent_template_id": 348, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "merge_request_iid": 450, + "comment": "Thanks, working on reviews", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 390, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 348, + "start_urls": ["__GITLAB__"], + "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project", "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": { "mr": "semantic HTML post", "repo": "a11yproject/a11yproject.com", "content": "lgtm" }, - "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - 
"merge_request_iid": 1531, - "comment": "lgtm" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1531", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "lgtm" - } - } - ] - }, - "intent_template_id": 348, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1531, + "comment": "lgtm", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 391, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 348, + "start_urls": ["__GITLAB__"], + "intent": "Post \"close because non reproducible\" for the merge request related to focus edge cases in a11yproject/a11yproject.com project", "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": { "mr": "focus edge cases", "repo": "a11yproject/a11yproject.com", "content": "close because non reproducible" }, - "intent": "Post \"close because non reproducible\" for the 
merge request related to focus edge cases in a11yproject/a11yproject.com project", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1265, - "comment": "close because non reproducible" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1265", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "close because non reproducible" - } - } - ] - }, - "intent_template_id": 348, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1265, + "comment": "close because non reproducible", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 392, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": { - 
"mr": "color ulitity", - "repo": "a11yproject.com", - "content": "Good idea" - }, - "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1071, - "comment": "Good idea" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1071", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "Good idea" - } - } - ] - }, "intent_template_id": 348, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", + "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", + "instantiation_dict": {"mr": "color ulitity", "repo": "a11yproject.com", "content": "Good idea"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1071, + "comment": "Good idea", + "site": 
"gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 393, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 348, + "start_urls": ["__GITLAB__"], + "intent": "Post \"lgtm\" for the merge request related to fixing the broken links in byteblaze/empathy-prompts project", "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": { "mr": "fixing the broken links", "repo": "byteblaze/empathy-prompts", "content": "lgtm" }, - "intent": "Post \"lgtm\" for the merge request related to fixing the broken links in byteblaze/empathy-prompts project", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "byteblaze", - "project": "empathy-prompts", - "merge_request_iid": 19, - "comment": "lgtm" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/empathy-prompts/-/merge_requests/19", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "lgtm" - } - } - ] - }, - "intent_template_id": 348, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": 
"Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "empathy-prompts", + "merge_request_iid": 19, + "comment": "lgtm", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 394, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "2019-nCov" - }, - "intent": "Fork 2019-nCov.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "2019-nCov" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/2019-nCov", - "locator": "", - "required_contents": { - "must_include": [ - "2019-nCov" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork 2019-nCov.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "2019-nCov"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "2019-nCov", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 395, - 
"require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "the Pytorch GAN repo with most stars" - }, - "intent": "Fork the Pytorch GAN repo with most stars.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "Pytorch-GAN" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/PyTorch-GAN", - "locator": "", - "required_contents": { - "must_include": [ - "Pytorch-GAN" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork the Pytorch GAN repo with most stars.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "the Pytorch GAN repo with most stars"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "Pytorch-GAN", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 396, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "ChatGPT" - }, - "intent": "Fork ChatGPT.", 
- "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "ChatGPT" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/ChatGPT", - "locator": "", - "required_contents": { - "must_include": [ - "ChatGPT" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork ChatGPT.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "ChatGPT"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "ChatGPT", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 397, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "MetaSeq" - }, - "intent": "Fork MetaSeq.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "metaseq" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__GITLAB__/byteblaze/metaseq", - "locator": "", - "required_contents": { - "must_include": [ - "metaseq" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork MetaSeq.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "MetaSeq"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "metaseq", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 398, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "all source repos from Akilesh Kannan" - }, - "intent": "Fork all source repos from Akilesh Kannan.", - "original.intent": "Fork all source repos from Akilesh Kannan", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": [ - "SimCache", - "dots", - "CacheEval", - "nvidia-patch", - "viewgrades-scraper" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/SimCache", - "locator": "", - "required_contents": { - "must_include": [ - "SimCache" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/dots", - 
"locator": "", - "required_contents": { - "must_include": [ - "dots" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/CacheEval", - "locator": "", - "required_contents": { - "must_include": [ - "CacheEval" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/nvidia-patch", - "locator": "", - "required_contents": { - "must_include": [ - "404" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/viewgrades-scraper", - "locator": "", - "required_contents": { - "must_include": [ - "404" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork all source repos from Akilesh Kannan.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "all source repos from Akilesh Kannan"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": ["SimCache", "dots", "CacheEval", "nvidia-patch", "viewgrades-scraper"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 399, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Change my reddit bio to \"{{content}}\"", - "instantiation_dict": { - "content": "I am a robot" - }, - "intent": "Change my reddit bio to \"I am a robot\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_bio_content", - "expected_data": { - "bio": "I am a 
robot" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".user-bio__biography\").outerText", - "required_contents": { - "exact_match": "I am a robot" - } - } - ] - }, "intent_template_id": 6, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Change my reddit bio to \"I am a robot\"", + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": {"content": "I am a robot"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"content": "I am a robot"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 400, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Change my reddit bio to \"{{content}}\"", - "instantiation_dict": { - "content": "Pro Python Developer with 20 years of Experience" - }, - "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_bio_content", - "expected_data": { - "bio": "Pro Python Developer with 20 years of Experience" - } - } - } - ], - "site": "reddit" - }, - 
"original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".user-bio__biography\").outerText", - "required_contents": { - "exact_match": "Pro Python Developer with 20 years of Experience" - } - } - ] - }, "intent_template_id": 6, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"", + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": {"content": "Pro Python Developer with 20 years of Experience"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"content": "Pro Python Developer with 20 years of Experience"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 401, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Change my reddit bio to \"{{content}}\"", - "instantiation_dict": { - "content": "Seeking SDE positions" - }, - "intent": "Change my reddit bio to \"Seeking SDE positions\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_bio_content", - "expected_data": { - "bio": "Seeking SDE positions" - } - } - } - ], - "site": "reddit" - }, - 
"original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".user-bio__biography\").outerText", - "required_contents": { - "exact_match": "Seeking SDE positions" - } - } - ] - }, "intent_template_id": 6, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Change my reddit bio to \"Seeking SDE positions\"", + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": {"content": "Seeking SDE positions"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"content": "Seeking SDE positions"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 402, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Change my reddit bio to \"{{content}}\"", - "instantiation_dict": { - "content": "Freelance Web Developer" - }, - "intent": "Change my reddit bio to \"Freelance Web Developer\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_bio_content", - "expected_data": { - "bio": "Freelance Web Developer" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - 
"reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".user-bio__biography\").outerText", - "required_contents": { - "exact_match": "Freelance Web Developer" - } - } - ] - }, "intent_template_id": 6, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Change my reddit bio to \"Freelance Web Developer\"", + "intent_template": "Change my reddit bio to \"{{content}}\"", + "instantiation_dict": {"content": "Freelance Web Developer"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"content": "Freelance Web Developer"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 403, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 6, + "start_urls": ["__REDDIT__"], + "intent": "Change my reddit bio to \"Awesome Prompt Artist\"", "intent_template": "Change my reddit bio to \"{{content}}\"", - "instantiation_dict": { - "content": "Awesome Prompt Artist" - }, - "intent": "Change my reddit bio to \"Awesome Prompt Artist\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_bio_content", - "expected_data": { - "bio": "Awesome Prompt Artist" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - 
"reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/user/MarvelsGrantMan136", - "locator": "document.querySelector(\".user-bio__biography\").outerText", - "required_contents": { - "exact_match": "Awesome Prompt Artist" - } - } - ] - }, - "intent_template_id": 6, - "changelogs": [ + "instantiation_dict": {"content": "Awesome Prompt Artist"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"content": "Awesome Prompt Artist"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 404, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": { - "subreddit": "books" - }, - "intent": "Upvote the newest post in books subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "subreddit": "books" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": 
{ - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, "intent_template_id": 22, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Upvote the newest post in books subreddit", + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": {"subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [124260] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 405, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": { - "subreddit": "DIY" - }, - "intent": "Upvote the newest post in DIY subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "subreddit": "diy" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/DIY/119019/how-can-i-bring-an-hdmi-cable-from-my-pc-downstairs-to-my-tv", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } 
- } - ] - }, "intent_template_id": 22, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Upvote the newest post in DIY subreddit", + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": {"subreddit": "DIY"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [119019] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 406, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": { - "subreddit": "future technology" - }, - "intent": "Upvote the newest post in future technology subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "subreddit": "future technology" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/Futurology/119517/openai-ceo-it-s-not-funny-that-i-m-afraid-of-the-ai-we-re", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, 
"intent_template_id": 22, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Upvote the newest post in future technology subreddit", + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": {"subreddit": "future technology"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [119517] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 407, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": { - "subreddit": "deeplearning" - }, - "intent": "Upvote the newest post in deeplearning subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "subreddit": "deeplearning" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/deeplearning/124993/meta-s-llama-weights-leaked-on-torrent-and-the-best-thing", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, 
"intent_template_id": 22, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Upvote the newest post in deeplearning subreddit", + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": {"subreddit": "deeplearning"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [125036] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 408, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": { - "subreddit": "explain like im 5" - }, - "intent": "Upvote the newest post in explain like im 5 subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "subreddit": "explain like im 5" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/explainlikeimfive/39244/eli5-how-does-pinching-a-ribbon-and-sliding-your-finger", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - 
] - }, "intent_template_id": 22, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Upvote the newest post in explain like im 5 subreddit", + "intent_template": "Upvote the newest post in {{subreddit}} subreddit", + "instantiation_dict": {"subreddit": "explain like im 5"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [125342] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 409, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", - "geolocation": null, + "intent_template_id": 23, + "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], + "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"", "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", "instantiation_dict": { "position_description": "the manager of the website", "content_description": "thanks! I am a big fan of your website." }, - "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "post_id": "1235250", - "comment_content": "thanks! 
I am a big fan of your website." - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/books/59421/-/comment/1235250", - "locator": "", - "required_contents": { - "must_include": [ - "thanks! I am a big fan of your website." - ] - } - } - ] - }, - "intent_template_id": 23, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "comment_id": "1235250", + "reply_content": "thanks! I am a big fan of your website." 
+ } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 410, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself", - "geolocation": null, + "intent_template_id": 23, + "start_urls": ["__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself"], + "intent": "Reply to the first reply in this post with \"don't panic\"", "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", "instantiation_dict": { "position_description": "the first reply", "content_description": "don't panic" }, - "intent": "Reply to the first reply in this post with \"don't panic\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_newest_post", - "expected_data": { - "post_id": "1042264", - "comment_content": "don't panic" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/singularity/69404/-/comment/1042264", - "locator": "", - "required_contents": { - "must_include": [ - "don't panic" - ] - } - } - ] - }, - "intent_template_id": 23, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + 
"expected": {"comment_id": "1042264", "reply_content": "don't panic"} } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 411, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make the LICENSE of {{repo}} to MIT license.", - "instantiation_dict": { - "repo": "byteblaze/cloud-to-butt" - }, - "intent": "Make the LICENSE of byteblaze/cloud-to-butt to MIT license.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "cloud-to-butt", - "file_path": "LICENSE.txt", - "substring": "MIT License", - "branch": "master" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE.txt", - "locator": "", - "required_contents": { - "must_include": [ - "MIT license" - ] - } - } - ] - }, "intent_template_id": 355, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Change the LICENSE for repo byteblaze/cloud-to-butt to an MIT license", + "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", + "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "license": "an MIT license"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use GitLab API to fetch file content; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + 
"evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "cloud-to-butt", + "file_path": "LICENSE.txt", + "substring": "MIT License", + "branch": "master", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 412, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make the LICENSE of {{repo}} to Apache License", + "intent_template_id": 355, + "start_urls": ["__GITLAB__"], + "intent": "Change the LICENSE for repo byteblaze/accessible-html-content-patterns to an Apache License", + "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": { - "repo": "byteblaze/accessible-html-content-patterns" + "repo": "byteblaze/accessible-html-content-patterns", + "license": "an Apache License" }, - "intent": "Make the LICENSE of byteblaze/accessible-html-content-patterns to Apache License", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "accessible-html-content-patterns", - "file_path": "LICENSE", - "substring": "Apache License", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE", - "locator": "", - "required_contents": { - "must_include": [ - "Apache License", - "http://www.apache.org/licenses/LICENSE-2.0" - ] - } - } - ] - }, - "intent_template_id": 355, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use GitLab API to fetch file content; more accurate" + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "accessible-html-content-patterns", + "file_path": "LICENSE", + "substring": "Apache License", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 413, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make the LICENSE of {{repo}} to one that mandates all copies and derivative works to be under the same license", + "intent_template_id": 355, + "start_urls": ["__GITLAB__"], + "intent": "Change the LICENSE for repo byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license", + "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": { - "repo": "byteblaze/a11y-syntax-highlighting" - }, - "intent": "Make the LICENSE of byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "a11y-syntax-highlighting", - "file_path": "LICENSE", - "substring": "GNU GENERAL PUBLIC LICENSE", - "not_substring": "MIT License", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE", - "locator": "", - "required_contents": { - "must_include": [ - "GENERAL PUBLIC LICENSE" - ] - } - } - ] + "repo": "byteblaze/a11y-syntax-highlighting", + "license": "one that mandates all copies and derivative works to be under the same license" }, - "intent_template_id": 355, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use GitLab API to fetch file content; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "a11y-syntax-highlighting", + "file_path": "LICENSE", + "substring": "GNU GENERAL PUBLIC LICENSE", + "not_substring": "MIT License", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 414, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make the LICENSE of {{repo}} to MIT license.", - "instantiation_dict": { - "repo": "byteblaze/dotfiles" - }, - "intent": "Make the LICENSE of byteblaze/dotfiles to MIT license.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "dotfiles", - "file_path": "LICENSE", - "substring": "MIT License", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - 
"reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", - "locator": "", - "required_contents": { - "must_include": [ - "MIT license", - "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." - ] - } - } - ] - }, "intent_template_id": 355, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Change the LICENSE for repo byteblaze/dotfiles to an MIT license", + "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", + "instantiation_dict": {"repo": "byteblaze/dotfiles", "license": "an MIT license"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use GitLab API to fetch file content; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "file_path": "LICENSE", + "substring": "MIT License", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 415, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", - "instantiation_dict": { - "topic": "verification functions" - }, - "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "byteblaze", - "project": "a11y-webring.club", - "merge_request_iid": 40, - "comment": "@davepgreene" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/a11y-webring.club/-/merge_requests/40", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "@davepgreene" - } - } - ] - }, "intent_template_id": 360, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "instantiation_dict": {"topic": "verification functions"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "a11y-webring.club", + "merge_request_iid": 40, + "comment": "@davepgreene", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 416, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", - "instantiation_dict": { - "topic": "wcag" - }, - "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1270, - "comment": "Thank you" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1270", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "Thank you" - } - } - ] - }, "intent_template_id": 360, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "instantiation_dict": {"topic": "wcag"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1270, + "comment": "Thank you", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 417, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", - "instantiation_dict": { - "topic": "404 link" - }, - "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_merge_request_have_comment", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1485, - "comment": "@Roshanjossey" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1485", - "locator": "document.querySelector('[id=\"notes-list\"').lastElementChild.querySelector('.timeline-discussion-body').outerText", - "required_contents": { - "exact_match": "@Roshanjossey" - } - } - ] - }, "intent_template_id": 360, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "instantiation_dict": {"topic": "404 link"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API call for MR comment verification; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "merge_request_iid": 1485, + "comment": "@Roshanjossey", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 418, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": { - "status": "Busy" - }, - "intent": "Set my gitlab status as Busy.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_user_status_message", - "eval_params": { - "user_id": "byteblaze", - "compare_str": "Busy" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.cover-status').lastChild.textContent", - "required_contents": { - "exact_match": "Busy" - } - } - ] - }, "intent_template_id": 361, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set my gitlab status as Busy.", + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": {"status": "Busy"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "compare_str": "Busy", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 419, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": { - "status": "Enjoying life" - }, + "intent_template_id": 361, + "start_urls": ["__GITLAB__"], "intent": "Set my gitlab status as Enjoying life.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_user_status_message", - "eval_params": { - "user_id": "byteblaze", - "compare_str": "Enjoying life" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.cover-status').lastChild.textContent", - "required_contents": { - "exact_match": "Enjoying life" - } - } - ] - }, - "intent_template_id": 361, - "changelogs": [ + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": {"status": "Enjoying life"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to 
verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "compare_str": "Enjoying life", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 420, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": { - "status": "Playing Badminton" - }, - "intent": "Set my gitlab status as Playing Badminton.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_user_status_message", - "eval_params": { - "user_id": "byteblaze", - "compare_str": "Playing Badminton" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.cover-status').lastChild.textContent", - "required_contents": { - "exact_match": "Playing Badminton" - } - } - ] - }, "intent_template_id": 361, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set my gitlab status as Playing Badminton.", + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": {"status": "Playing Badminton"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "compare_str": "Playing Badminton", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 421, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": { - "status": "Resting due to leg injury" - }, - "intent": "Set my gitlab status as Resting due to leg injury.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_user_status_message", - "eval_params": { - "user_id": "byteblaze", - "compare_str": "Resting due to leg injury" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.cover-status').lastChild.textContent", - "required_contents": { - "exact_match": "Resting due to leg injury" - } - } - ] - }, "intent_template_id": 361, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set my gitlab status as Resting due to leg injury.", + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": {"status": "Resting due to leg injury"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": 
"SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "compare_str": "Resting due to leg injury", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 422, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": { - "status": "Out of Office" - }, - "intent": "Set my gitlab status as Out of Office.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_user_status_message", - "eval_params": { - "user_id": "byteblaze", - "compare_str": "Out of Office" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.cover-status').lastChild.textContent", - "required_contents": { - "exact_match": "Out of Office" - } - } - ] - }, "intent_template_id": 361, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set my gitlab status as Out of Office.", + "intent_template": "Set my gitlab status as {{status}}.", + "instantiation_dict": {"status": "Out of Office"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": 
"Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "compare_str": "Out of Office", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 423, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{brand}} shirts on sale", - "instantiation_dict": { - "brand": "Hollister" - }, - "intent": "Mark all Hollister shirts on sale", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "126" - }, - "expected_data": { - "on_sale": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/126/", - "locator": "document.querySelector('input[name=\"product[sale]\"]').value", - "required_contents": { - "exact_match": "1" - } - } - ] - }, "intent_template_id": 237, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all Hollister shirts on sale", + "intent_template": "Mark all {{brand}} shirts on sale", + "instantiation_dict": {"brand": "Hollister"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": false, + "expected": { 
+ "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/126/type/configurable/store/0/set/9/?isAjax=true", + "event_type": "modification", + "headers": { + "referer": "__SHOPPING_ADMIN__/admin/catalog/product/edit/id/126/", + "X-Requested-With": "XMLHttpRequest" + }, + "post_data": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"} + } } - ] + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 424, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the page of {{description}} on the map.", - "instantiation_dict": { - "description": "the place where Mr. Rogers was filmed" - }, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "Find the page of the place where Mr. Rogers was filmed on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Pittsburgh" - ] - } - } - ] - }, - "intent_template_id": 371 + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": {"description": "the place where Mr. 
Rogers was filmed"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 425, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the page of {{description}} on the map.", - "instantiation_dict": { - "description": "the longest bridge in the Western hemisphere" - }, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "Find the page of the longest bridge in the Western hemisphere on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Mackinac Bridge" - ] - } - } - ] - }, - "intent_template_id": 371 + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": {"description": "the longest bridge in the Western hemisphere"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 426, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Find the page of the place in Pennsylvania where a plane 
crashed during the September 11th attacks on the map.", "intent_template": "Find the page of {{description}} on the map.", "instantiation_dict": { "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" }, - "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Somerset County" - ] - } - } - ] - }, - "intent_template_id": 371 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 427, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the page of {{description}} on the map.", - "instantiation_dict": { - "description": "the university that has most Turning Award winners" - }, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "Find the page of the university that has most Turning Award winners on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Massachusetts Institute of Technology" - ] - } - } - ] - }, - 
"intent_template_id": 371 + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": {"description": "the university that has most Turning Award winners"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 428, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.", "intent_template": "Find the page of {{description}} on the map.", "instantiation_dict": { "description": "the undergrad college of the person who developed the Nash equilibrium" }, - "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Carnegie Mellon University" - ] - } - } - ] - }, - "intent_template_id": 371 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 429, - "require_login": true, - "storage_state": null, - "start_url": 
"__MAP__", - "geolocation": null, - "intent_template": "Find the page of {{description}} on the map.", - "instantiation_dict": { - "description": "the colleges where The Chair was filmed in Pittsburgh" - }, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Chatham University" - ] - } - } - ] - }, - "intent_template_id": 371 + "intent_template": "Find the page of {{description}} on the map.", + "instantiation_dict": {"description": "the colleges where The Chair was filmed in Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 430, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 371, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.", "intent_template": "Find the page of {{description}} on the map.", "instantiation_dict": { "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" }, - "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.", - "require_reset": false, - 
"eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sidebar_content\"').outerText", - "required_contents": { - "must_include": [ - "Washington & Jefferson College" - ] - } - } - ] - }, - "intent_template_id": 371 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 431, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html |AND| __SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html |AND| __SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html", - "geolocation": null, + "intent_template_id": 145, + "start_urls": [ + "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html", + "__SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html", + 
"__SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html" + ], + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/checkout/cart", - "locator": "", - "required_contents": { - "must_include": [ - "SPAAS White Taper Candles - 4 Pack |OR| 10 Inch Tall Candles, Scent-Free Premium Wax Candle Sticks |OR| 8 Hour Long Burning White Candlesticks for Home Decoration, Wedding, Holiday and Parties" - ] - } - } - ] - }, - "intent_template_id": 145 + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B0933NCMSC"} + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 432, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html |AND| __SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html", - "geolocation": null, + "intent_template_id": 145, + "start_urls": [ + 
"__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html", + "__SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html" + ], + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_shopping_cart", - "expected_data": { - "sku": "B00CPTR7WS" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/checkout/cart", - "locator": "", - "required_contents": { - "must_include": [ - "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24" - ] - } - } - ] - }, - "intent_template_id": 145, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B00CPTR7WS"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 433, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": 
"__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html |AND| __SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html", - "geolocation": null, + "intent_template_id": 145, + "start_urls": [ + "__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html", + "__SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html" + ], + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_shopping_cart", - "expected_data": { - "sku": "B07Q1NRQBW" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/checkout/cart", - "locator": "", - "required_contents": { - "must_include": [ - "Tazrigo 5pcs White Dental Resin Brush Pens Dental Shaping Silicone Tooth Tool" - ] - } - } - ] - }, - "intent_template_id": 145, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": 
"dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07Q1NRQBW"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 434, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html |AND| __SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html", - "geolocation": null, + "intent_template_id": 145, + "start_urls": [ + "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html", + "__SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html" + ], + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_shopping_cart", - "expected_data": { - "sku": "B08MFJFHQ4" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/checkout/cart", - "locator": "", - "required_contents": { - "must_include": [ - "VIVIKI Women Glitter Socks Ultrathin Transparent Tulle Lace Socks - No Show Ankle Crew Socks 3 Pack" - ] - } - } - ] - }, - "intent_template_id": 145, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - 
"key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B08MFJFHQ4"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 435, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html |AND| __SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html", - "geolocation": null, + "intent_template_id": 145, + "start_urls": [ + "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html", + "__SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html" + ], + "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_shopping_cart", - "expected_data": { - "sku": "B01JMA0YX6" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": 
"", - "program_html": [ - { - "url": "__SHOPPING__/checkout/cart", - "locator": "", - "required_contents": { - "must_include": [ - "DP to HDMI Cable 6FT (2 Pack), Fosmon Gold Plated Displayport to HDMI Cable 1080p Full HD for PCs to HDTV, Monitor, Projector with HDMI Port" - ] - } - } - ] - }, - "intent_template_id": 145, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01JMA0YX6"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 436, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 156, + "start_urls": ["__SHOPPING__"], + "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": { - "product": "a mattress foundation", - "time": "around Feb or March 2023" - }, - "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", - "original.intent": "I previously ordered some a mattress foundation around Feb or March 2023 and later cancelled. 
Can you reorder it for me?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B07DFJ5XKH" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B07DFJ5XKH" - ] - } - } - ] - }, - "intent_template_id": 156, - "changelogs": [ + "instantiation_dict": {"product": "a mattress foundation", "time": "around Feb or March 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix wording" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07DFJ5XKH"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 437, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": { - "product": "a table lamp", - "time": "in May 2023" - }, - "intent": "I previously ordered a table lamp in May 2023 and later cancelled. 
Can you reorder it for me?", - "original.intent": "I previously ordered some a table lamp in May 2023 and later cancelled. Can you reorder it for me?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B087QSCXGT" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B072XS3F6W" - ] - } - } - ] - }, "intent_template_id": 156, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix wording" - }, + "start_urls": ["__SHOPPING__"], + "intent": "I previously ordered a table lamp in May 2023 and later cancelled. Can you reorder it for me?", + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": {"product": "a table lamp", "time": "in May 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "reference_alignment", - "note": "Original sku is for a lamp ordered in February not May. 
Updated to the lamp ordered in May" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B087QSCXGT"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 438, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": { - "product": "a TV stand", - "time": "sometime around sep 2022" - }, - "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?", - "original.intent": "I previously ordered some a TV stand sometime around sep 2022 and later cancelled. 
Can you reorder it for me?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B017SK5E3M" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B08PVHRRB7" - ] - } - } - ] - }, "intent_template_id": 156, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?", + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": {"product": "a TV stand", "time": "sometime around sep 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix wording" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B017SK5E3M"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 439, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", - "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": { - "product": "a cat t-shirt", - "time": "during 2022" - }, - "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", - "original.intent": "I previously ordered some a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B0844BWS76" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B0844BWS76" - ] - } - } - ] - }, "intent_template_id": 156, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", + "instantiation_dict": {"product": "a cat t-shirt", "time": "during 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix wording" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B0844BWS76"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 440, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "original.intent_template": "I previously ordered some {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": { - "product": "a make up removal kit", - "time": "during summer 2022" - }, - "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?", - "original.intent": "I previously ordered some a make up removal kit during summer 2022 and later cancelled. 
Can you reorder it for me?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B0738JQG6Q" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B0738JQG6Q" - ] - } - } - ] - }, "intent_template_id": 156, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?", + "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", + "instantiation_dict": {"product": "a make up removal kit", "time": "during summer 2022"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix wording" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B0738JQG6Q"} } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 441, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/gimmiethat.space", - "geolocation": null, - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": { - "title": "GIVE ME SPACE" - }, - "intent": "Update the project site's 
title to \"GIVE ME SPACE\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "GIVE ME SPACE", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", - "locator": "", - "required_contents": { - "must_include": [ - "GIVE ME SPACE" - ] - } - } - ] - }, "intent_template_id": 308, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], + "intent": "Update the project site's title to \"GIVE ME SPACE\"", + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": {"title": "GIVE ME SPACE"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "GIVE ME SPACE", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 442, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/gimmiethat.space", - "geolocation": null, - 
"intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": { - "title": "Welcome to my site" - }, - "intent": "Update the project site's title to \"Welcome to my site\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Welcome to my site", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", - "locator": "", - "required_contents": { - "must_include": [ - "Welcome to my site" - ] - } - } - ] - }, "intent_template_id": 308, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], + "intent": "Update the project site's title to \"Welcome to my site\"", + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": {"title": "Welcome to my site"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Welcome to my site", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - 
], + "sites": ["gitlab"], "task_id": 443, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/gimmiethat.space", - "geolocation": null, - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": { - "title": "Not an interesting site" - }, - "intent": "Update the project site's title to \"Not an interesting site\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Not an interesting site", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", - "locator": "", - "required_contents": { - "must_include": [ - "Not an interesting site" - ] - } - } - ] - }, "intent_template_id": 308, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], + "intent": "Update the project site's title to \"Not an interesting site\"", + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": {"title": "Not an interesting site"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": 
"byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Not an interesting site", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 444, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/gimmiethat.space", - "geolocation": null, - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": { - "title": "Title Wanted" - }, - "intent": "Update the project site's title to \"Title Wanted\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Title Wanted", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", - "locator": "", - "required_contents": { - "must_include": [ - "Title Wanted" - ] - } - } - ] - }, "intent_template_id": 308, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], + "intent": "Update the project site's title to \"Title Wanted\"", + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": {"title": "Title Wanted"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": 
"expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Title Wanted", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 445, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/gimmiethat.space", - "geolocation": null, - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": { - "title": "Hello" - }, - "intent": "Update the project site's title to \"Hello\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Hello", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html", - "locator": "", - "required_contents": { - "must_include": [ - "Hello" - ] - } - } - ] - }, "intent_template_id": 308, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], + "intent": "Update the project site's title to \"Hello\"", + "intent_template": "Update the project site's title to \"{{title}}\"", + "instantiation_dict": {"title": "Hello"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": 
"null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "index.html", + "substring": "Hello", + "branch": "main", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 446, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", - "instantiation_dict": { - "repo": "a11yproject", - "issue": 404, - "account": "Roshanjossey" - }, - "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "issue_assignee_is", - "eval_params": { - "usernames": [ - "Roshanjossey" - ], - "group": "a11yproject", - "project": "a11yproject.com", - "issue_iid": 1478 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Roshanjossey", - "locator": "", - "required_contents": { - "must_include": [ - "404s, bad host, timeouts, bad urls for URLs linked from website" - ] - } - } - ] - }, "intent_template_id": 999, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.", + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "Roshanjossey"}, + "format_specification": null, + 
"start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "usernames": ["Roshanjossey"], + "group": "a11yproject", + "project": "a11yproject.com", + "issue_iid": 1478, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 447, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 999, + "start_urls": ["__GITLAB__"], + "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", "instantiation_dict": { "repo": "a11y-webring.club", "issue": "linking to an accessibility statement", "account": "Rohan" }, - "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "issue_assignee_is", - "eval_params": { - "usernames": [ - "Seirdy" - ], - "group": "byteblaze", - "project": "a11y-webring.club", - "issue_iid": 71 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Seirdy", - "locator": "", - "required_contents": { - "must_include": [ - "linking to an accessibility 
statement" - ] - } - } - ] - }, - "intent_template_id": 999, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "usernames": ["Seirdy"], + "group": "byteblaze", + "project": "a11y-webring.club", + "issue_iid": 71, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 448, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": { - "url": "https://egg.tart.com" - }, - "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_website_url", - "eval_params": { - "username": "byteblaze", - "website_url": "https://egg.tart.com" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", - "required_contents": { - "exact_match": "egg.tart.com" - } - } - ] - }, "intent_template_id": 331, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", + "intent_template": "set the homepage 
URL on my GitLab profile to {{url}}", + "instantiation_dict": {"url": "https://egg.tart.com"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "website_url": "https://egg.tart.com", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 449, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": { - "url": "https://helloworld.xyz" - }, - "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_website_url", - "eval_params": { - "username": "byteblaze", - "website_url": "https://helloworld.xyz" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", - "required_contents": { - "exact_match": "helloworld.xyz" - } - } - ] - }, "intent_template_id": 331, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz", + "intent_template": "set the homepage URL on my 
GitLab profile to {{url}}", + "instantiation_dict": {"url": "https://helloworld.xyz"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "website_url": "https://helloworld.xyz", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 450, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": { - "url": "a11yproject.contributor.me" - }, - "intent": "set the homepage URL on my GitLab profile to a11yproject.contributor.me", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", - "required_contents": { - "exact_match": "a11yproject.contributor.me" - } - } - ] - }, "intent_template_id": 331, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "set the homepage URL on my GitLab profile to https://a11yproject.contributor.me", + "intent_template": "set the homepage URL on 
my GitLab profile to {{url}}", + "instantiation_dict": {"url": "https://a11yproject.contributor.me"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_data", - "category": "unachievable_tasks", - "note": "Website URL invalid in GitLab; cannot fulfill" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "website_url": "https://a11yproject.contributor.me", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 451, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 331, + "start_urls": ["__GITLAB__"], + "intent": "set the homepage URL on my GitLab profile to https://www.byteblaze.com", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": { - "url": "www.byteblaze.com" - }, - "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", - "required_contents": { - "exact_match": "www.byteblaze.com" - } - } - ] - }, - "intent_template_id": 331, - "changelogs": [ + "instantiation_dict": {"url": 
"https://www.byteblaze.com"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_data", - "category": "unachievable_tasks", - "note": "Website URL invalid in GitLab; cannot fulfill" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "website_url": "https://www.byteblaze.com", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 452, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": { - "url": "byteblaze.github.io" - }, - "intent": "set the homepage URL on my GitLab profile to byteblaze.github.io", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze", - "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText", - "required_contents": { - "exact_match": "byteblaze.github.io" - } - } - ] - }, "intent_template_id": 331, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "set the homepage URL on my GitLab profile to https://byteblaze.github.io", + "intent_template": "set the homepage URL on my GitLab profile to {{url}}", + "instantiation_dict": {"url": 
"https://byteblaze.github.io"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_data", - "category": "unachievable_tasks", - "note": "Website URL not valid in GitLab" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "website_url": "https://byteblaze.github.io", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 453, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": { - "product": "Teton pullover hoodie" - }, - "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Teton pullover hoodie" - }, - "expected_data": { - "is_disabled": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/78/", - "locator": "document.querySelector('[name=\"product[status]\"').value", - "required_contents": { - "exact_match": "2" - } - } - ] - }, "intent_template_id": 242, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", + 
"intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": {"product": "Teton pullover hoodie"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Teton pullover hoodie", + "site": "shopping_admin", + "expected": {"is_disabled": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 454, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": { - "product": "Ryker Tee Crew Neck" - }, - "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Ryker LumaTech™ Tee (Crew-neck)" - }, - "expected_data": { - "is_disabled": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/478/", - "locator": "document.querySelector('[name=\"product[status]\"').value", - "required_contents": { - "exact_match": "2" - } - } - ] - }, "intent_template_id": 242, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality 
issues.", + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": {"product": "Ryker Tee Crew Neck"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Ryker LumaTech™ Tee (Crew-neck)", + "site": "shopping_admin", + "expected": {"is_disabled": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 455, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": { - "product": "Helios Endurance Tank" - }, - "original.instantiation_dict": { - "product": "lHelios Endurance Tank" - }, - "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.", - "original.intent": "Disable lHelios Endurance Tank from the site, they are facing some quality issues.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Helios Endurance Tank" - }, - "expected_data": { - "is_disabled": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/676/", - "locator": "document.querySelector('[name=\"product[status]\"').value", - "required_contents": { - 
"exact_match": "2" - } - } - ] - }, "intent_template_id": 242, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.", + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": {"product": "Helios Endurance Tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "reference_alignment", - "note": "Product name misspelled causing failure to find product" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Helios Endurance Tank", + "site": "shopping_admin", + "expected": {"is_disabled": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 456, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": { - "product": "Cora Pant" - }, - "intent": "Disable Cora Pant from the site, they are facing some quality issues.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Cora Parachute Pant" - }, - "expected_data": { - "is_disabled": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1840/", - "locator": "document.querySelector('[name=\"product[status]\"').value", - "required_contents": { - "exact_match": "2" - } - } - ] - }, "intent_template_id": 242, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Disable Cora Pant from the site, they are facing some quality issues.", + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": {"product": "Cora Pant"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Cora Parachute Pant", + "site": "shopping_admin", + "expected": {"is_disabled": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 457, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": { - "product": "Karmen yoga pants" - }, - "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Karmen Yoga Pant" - }, - "expected_data": { - "is_disabled": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1819/", - "locator": "document.querySelector('[name=\"product[status]\"').value", - "required_contents": { - "exact_match": "2" - } - } - ] - }, "intent_template_id": 242, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", + "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", + "instantiation_dict": {"product": "Karmen yoga pants"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Karmen Yoga Pant", + "site": "shopping_admin", + "expected": {"is_disabled": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 458, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "$5", - "action": "Reduce" - }, - "intent": "Reduce the price of this product by $5", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1481" - }, - "expected_data": { - "price": "27.00" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "27.00" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], + "intent": "Reduce the price of this product by $5", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "$5", "performed_operation": "Reduce"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1481", + "site": "shopping_admin", + "expected": {"price": "27.00"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 459, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "10%", - "action": "Reduce" - }, - "intent": "Reduce the price of this product by 10%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "237" - }, - "expected_data": { - "price": "62.10" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/237/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "62.10" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/237/"], + "intent": "Reduce the price of this product by 10%", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "10%", "performed_operation": "Reduce"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "237", + "site": "shopping_admin", + "expected": {"price": "62.10"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 460, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "15%", - "action": "Reduce" - }, - "intent": "Reduce the price of this product by 15%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "418" - }, - "expected_data": { - "price": "38.25" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/418/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "38.25" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/418/"], + "intent": "Reduce the price of this product by 15%", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "15%", "performed_operation": "Reduce"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "418", + "site": "shopping_admin", + "expected": {"price": "38.25"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 461, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "$11.5", - "action": "Increase" - }, - "intent": "Increase the price of this product by $11.5", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "721" - }, - "expected_data": { - "price": "29.50" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/721/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "29.50" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/721/"], + "intent": "Increase the price of this product by $11.5", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "$11.5", "performed_operation": "Increase"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "721", + "site": "shopping_admin", + "expected": {"price": "29.50"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 462, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "10%", - "action": "Increase" - }, - "intent": "Increase the price of this product by 10%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "710" - }, - "expected_data": { - "price": "19.80" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/710/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "19.80" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/710/"], + "intent": "Increase the price of this product by 10%", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "10%", "performed_operation": "Increase"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "710", + "site": "shopping_admin", + "expected": {"price": "19.80"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 463, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/", - "geolocation": null, - "intent_template": "{{action}} the price of this product by {{amount}}", - "instantiation_dict": { - "amount": "15%", - "action": "Increase" - }, - "intent": "Increase the price of this product by 15%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "996" - }, - "expected_data": { - "price": "36.80" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/996/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "36.80" - } - } - ] - }, "intent_template_id": 247, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/996/"], + "intent": "Increase the price of this product by 15%", + "intent_template": "{{action}} the price of this product by {{amount}}", + "instantiation_dict": {"amount": "15%", "performed_operation": "Increase"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "996", + "site": "shopping_admin", + "expected": {"price": "36.80"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 464, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", - "original.intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", - "instantiation_dict": { - "product": "Antonia Racer Tank" - }, - "intent": "Update the product description of Antonia Racer Tank with the titles of all reviews with 4 stars or above.", - "original.intent": "Update the product description of Antonia Racer Tank to highlight the real user positive reviews by quoting the comments", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1796" - }, - "expected_data": { - "description": [ - "A regular or me" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html", - "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", - "required_contents": { - "must_include": [ - "This is in regular rotation at the gym", - "Its colorful and looks kinda cute under my exercise tanks", - "it's very stylish for yoga or something else low impact" - ] - } - } - ] - }, "intent_template_id": 251, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update the product description of Antonia Racer Tank with the titles of all reviews with 4 stars or above.", + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "instantiation_dict": {"product": "Antonia Racer Tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify the intent to specify exactly what to update the value with" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1796", + "site": "shopping_admin", + "expected": { "description": ["A regular or me"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + 
"sites": ["shopping"], "task_id": 465, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 186, + "start_urls": ["__SHOPPING__"], + "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list", "intent_template": "Add {{product}} to my wish list", "instantiation_dict": { "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" }, - "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B074QVN413" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" - ] - } - } - ] - }, - "intent_template_id": 186, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B074QVN413"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 466, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - 
"intent_template": "Add {{product}} to my wish list", - "instantiation_dict": { - "product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" - }, - "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B01M1RMOLX" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts" - ] - } - } - ] - }, "intent_template_id": 186, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", + "intent_template": "Add {{product}} to my wish list", + "instantiation_dict": {"product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01M1RMOLX"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 467, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 186, + "start_urls": ["__SHOPPING__"], + 
"intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list", "intent_template": "Add {{product}} to my wish list", "instantiation_dict": { "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" }, - "intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B09STCV25D" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" - ] - } - } - ] - }, - "intent_template_id": 186, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B09STCV25D"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 468, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 186, + "start_urls": 
["__SHOPPING__"], + "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list", "intent_template": "Add {{product}} to my wish list", "instantiation_dict": { "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" }, - "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B09QZRWT97" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" - ] - } - } - ] - }, - "intent_template_id": 186, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B09QZRWT97"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 469, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 186, + 
"start_urls": ["__SHOPPING__"], + "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list", "intent_template": "Add {{product}} to my wish list", "instantiation_dict": { "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" }, - "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B09QXM7B42" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" - ] - } - } - ] - }, - "intent_template_id": 186, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B09QXM7B42"} } - ] + ], + "revision": 2 }, { - "sites": 
[ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 470, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Cancel order {{id}}", - "instantiation_dict": { - "id": "302" - }, - "intent": "Cancel order 302", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "302" - }, - "expected_data": { - "status": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", - "locator": "document.querySelector(\"#order_status\").outerText", - "required_contents": { - "exact_match": "Canceled" - } - } - ] - }, "intent_template_id": 257, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Cancel order 302", + "intent_template": "Cancel order {{id}}", + "instantiation_dict": {"id": "302"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "302", + "site": "shopping_admin", + "expected": {"status": "Canceled"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 471, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Cancel order {{id}}", - "instantiation_dict": { - "id": "307" 
- }, - "intent": "Cancel order 307", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "307" - }, - "expected_data": { - "status": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", - "locator": "document.querySelector(\"#order_status\").outerText", - "required_contents": { - "exact_match": "Canceled" - } - } - ] - }, "intent_template_id": 257, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Cancel order 307", + "intent_template": "Cancel order {{id}}", + "instantiation_dict": {"id": "307"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "307", + "site": "shopping_admin", + "expected": {"status": "Canceled"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 472, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Cancel order {{id}}", - "instantiation_dict": { - "id": "299" - }, - "intent": "Cancel order 299", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "299" - }, - "expected_data": { - "status": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, 
- "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299/", - "locator": "document.querySelector(\"#order_status\").outerText", - "required_contents": { - "exact_match": "Canceled" - } - } - ] - }, "intent_template_id": 257, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Cancel order 299", + "intent_template": "Cancel order {{id}}", + "instantiation_dict": {"id": "299"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "order_id": "299", + "site": "shopping_admin", + "expected": {"status": "Canceled"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 473, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Cancel order {{id}}", - "instantiation_dict": { - "id": "301" - }, - "intent": "Cancel order 301", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "301" - }, - "expected_data": { - "status": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301/", - "locator": "document.querySelector(\"#order_status\").outerText", - "required_contents": { - 
"exact_match": "Canceled" - } - } - ] - }, "intent_template_id": 257, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Cancel order 301", + "intent_template": "Cancel order {{id}}", + "instantiation_dict": {"id": "301"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "301", + "site": "shopping_admin", + "expected": {"status": "Canceled"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 474, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Cancel order {{id}}", - "instantiation_dict": { - "id": "305" - }, - "intent": "Cancel order 305", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "305" - }, - "expected_data": { - "status": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/305/", - "locator": "document.querySelector(\"#order_status\").outerText", - "required_contents": { - "exact_match": "Canceled" - } - } - ] - }, "intent_template_id": 257, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Cancel order 305", + "intent_template": "Cancel order {{id}}", + "instantiation_dict": {"id": "305"}, + "format_specification": null, + "start_url_context": 
null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "305", + "site": "shopping_admin", + "expected": {"status": "Canceled"} } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 475, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", - "instantiation_dict": { - "project_name": "chatgpt_plugin" - }, - "intent": "Set up a new, empty repository with the name chatgpt_plugin?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "chatgpt_plugin" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/chatgpt_plugin", - "locator": "", - "required_contents": { - "must_include": [ - "chatgpt_plugin" - ] - } - } - ] - }, "intent_template_id": 292, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set up a new, empty repository with the name chatgpt_plugin?", + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": {"project_name": "chatgpt_plugin"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "chatgpt_plugin", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 476, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", - "instantiation_dict": { - "project_name": "awesome_llm_reading" - }, - "intent": "Set up a new, empty repository with the name awesome_llm_reading?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "awesome_llm_reading" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/awesome_llm_reading", - "locator": "", - "required_contents": { - "must_include": [ - "awesome_llm_reading" - ] - } - } - ] - }, "intent_template_id": 292, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set up a new, empty repository with the name awesome_llm_reading?", + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": {"project_name": "awesome_llm_reading"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} 
}, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "awesome_llm_reading", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 477, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", - "instantiation_dict": { - "project_name": "awesome_program_aided_reasoning" - }, - "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "awesome_program_aided_reasoning" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/awesome_program_aided_reasoning", - "locator": "", - "required_contents": { - "must_include": [ - "awesome_program_aided_reasoning" - ] - } - } - ] - }, "intent_template_id": 292, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?", + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": {"project_name": "awesome_program_aided_reasoning"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, 
{ - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "awesome_program_aided_reasoning", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 478, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", - "instantiation_dict": { - "project_name": "webagent" - }, - "intent": "Set up a new, empty repository with the name webagent?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "webagent" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/webagent", - "locator": "", - "required_contents": { - "must_include": [ - "webagent" - ] - } - } - ] - }, "intent_template_id": 292, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Set up a new, empty repository with the name webagent?", + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": {"project_name": "webagent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + 
"evaluator": "BackendStateEvaluator", + "project": "webagent", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 479, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", - "instantiation_dict": { - "project_name": "awesome_webagent" - }, + "intent_template_id": 292, + "start_urls": ["__GITLAB__"], "intent": "Set up a new, empty repository with the name awesome_webagent?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_project", - "eval_params": { - "project": "awesome_webagent" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/awesome_webagent", - "locator": "", - "required_contents": { - "must_include": [ - "awesome_webagent" - ] - } - } - ] - }, - "intent_template_id": 292, - "changelogs": [ + "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "instantiation_dict": {"project_name": "awesome_webagent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": "awesome_webagent", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - 
], + "sites": ["gitlab"], "task_id": 480, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}}", - "instantiation_dict": { - "collaborator_account_list": "yjlou", - "repo": "solarized-prism-theme" - }, - "intent": "Invite yjlou as collaborator to solarized-prism-theme", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "yjlou" - ], - "group": "byteblaze", - "project": "solarized-prism-theme" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "yjlou" - ] - } - } - ] - }, "intent_template_id": 293, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Invite yjlou as collaborator(s) to solarized-prism-theme repo", + "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", + "instantiation_dict": {"collaborator_account_list": "yjlou", "repo": "solarized-prism-theme"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["yjlou"], + "group": "byteblaze", + "project": 
"solarized-prism-theme", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 481, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": { - "name": "Abishek" - }, - "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "abisubramanya27" - ], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 294, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "instantiation_dict": {"name": "Abishek"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["abisubramanya27"], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 482, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": { - "name": "yjlou" - }, - "intent": "yjlou wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "yjlou" - ], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 294, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "yjlou wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": {"name": "yjlou"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["yjlou"], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 483, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "{{name}} wants to check my dotfile 
configurations. Please invite him to the repo as a guest.", - "instantiation_dict": { - "name": "Koushik" - }, - "intent": "Koushik wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "koush" - ], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'koush')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 294, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Koushik wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "instantiation_dict": {"name": "Koushik"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["koush"], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 484, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": { - "name": "Jakub Klinkovsk\u00fd" - }, - "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "lahwaacz" - ], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 294, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "instantiation_dict": {"name": "Jakub Klinkovsk\u00fd"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["lahwaacz"], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 485, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - 
"intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": { - "name": "Vinta" - }, - "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "vinta" - ], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/dotfiles/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'vinta')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 294, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.", + "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", + "instantiation_dict": {"name": "Vinta"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["vinta"], + "group": "byteblaze", + "project": "dotfiles", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 486, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 275, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { "old-heading": "404 Not Found", "heading": "Bruh bro you clicked the wrong page" }, - "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_site_page_details", - "eval_params": { - "page_id": "1" - }, - "expected_data": { - "title": "Bruh bro you clicked the wrong page" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/cms/page/edit/page_id/1/", - "locator": "document.querySelector('input[name=\"title\"').value", - "required_contents": { - "exact_match": "Bruh bro you clicked the wrong page" - } - } - ] - }, - "intent_template_id": 275, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "page_id": "1", + "site": "shopping_admin", + "expected": {"title": "Bruh bro you clicked the wrong page"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 487, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 275, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { "old-heading": "Enable Cookies", "heading": "Cookie monster coming to your place" }, - "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_site_page_details", - "eval_params": { - "page_id": "3" - }, - "expected_data": { - "title": "Cookie monster coming to your place" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - 
"program_html": [ - { - "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/3/", - "locator": "document.querySelector('input[name=\"title\"').value", - "required_contents": { - "exact_match": "Cookie monster coming to your place" - } - } - ] - }, - "intent_template_id": 275, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "page_id": "3", + "site": "shopping_admin", + "expected": {"title": "Cookie monster coming to your place"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 488, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": { - "old-heading": "Home Page", - "heading": "This is the home page!! Leave here!!" - }, - "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_site_page_details", - "eval_params": { - "page_id": "2" - }, - "expected_data": { - "title": "This is the home page!! Leave here!!" 
- } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/2/", - "locator": "document.querySelector('input[name=\"title\"').value", - "required_contents": { - "exact_match": "This is the home page!! Leave here!!" - } - } - ] - }, "intent_template_id": 275, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": {"old-heading": "Home Page", "heading": "This is the home page!! Leave here!!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "page_id": "2", + "site": "shopping_admin", + "expected": {"title": "This is the home page!! 
Leave here!!"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 489, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 275, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { "old-heading": "Privacy Policy", "heading": "No privacy policy is needed in this dystopian world" }, - "original.instantiation_dict": { - "old-heading": "Privacy Policy", - "heading": "No privacy policy is needed is this dystopian world" - }, - "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", - "original.intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed is this dystopian world\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_site_page_details", - "eval_params": { - "page_id": "4" - }, - "expected_data": { - "title": "No privacy policy is needed in this dystopian world" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/4/", - "locator": "document.querySelector('input[name=\"title\"').value", - "required_contents": { - "exact_match": "No privacy policy is needed is this dystopian world" - } - } - ] - }, - "intent_template_id": 275, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - 
"note": "Update grammar for heading" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "page_id": "4", + "site": "shopping_admin", + "expected": {"title": "No privacy policy is needed in this dystopian world"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 490, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": { - "old-heading": "About us", - "heading": "Secret" - }, - "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_site_page_details", - "eval_params": { - "page_id": "5" - }, - "expected_data": { - "title": "Secret" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/5/", - "locator": "document.querySelector('input[name=\"title\"').value", - "required_contents": { - "exact_match": "Secret" - } - } - ] - }, "intent_template_id": 275, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", + "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": {"old-heading": "About us", "heading": "Secret"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "page_id": "5", + "site": "shopping_admin", + "expected": {"title": "Secret"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 491, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": { - "name": "Sarah Miller", - "message": "the order is ready to be shipped soon!" - }, - "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "System message: We cannot add order history." 
- }, "intent_template_id": 280, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": {"name": "Sarah Miller", "message": "the order is ready to be shipped soon!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 492, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": { - "name": "Jane Doe", - "message": "sorry we are out of stock, please reorder" - }, - "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "302" - }, - "expected_data": { - "comment": "sorry we are out of stock, please reorder" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/", - "locator": 
"document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", - "required_contents": { - "exact_match": "sorry we are out of stock, please reorder" - } - } - ] - }, "intent_template_id": 280, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": {"name": "Jane Doe", "message": "sorry we are out of stock, please reorder"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "302", + "site": "shopping_admin", + "expected": {"comment": "sorry we are out of stock, please reorder"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 493, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 280, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Notify Grace Nguyen in their most recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"", "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": { "name": "Grace Nguyen", "message": "sorry we are bankrupt, please contact our customer service for refund" }, - "intent": "Notify Grace Nguyen in their most 
recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "307" - }, - "expected_data": { - "comment": "sorry we are bankrupt, please contact our customer service for refund" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/", - "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", - "required_contents": { - "exact_match": "sorry we are bankrupt, please contact our customer service for refund" - } - } - ] - }, - "intent_template_id": 280, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "307", + "site": "shopping_admin", + "expected": { + "comment": "sorry we are bankrupt, please contact our customer service for refund" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 494, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": { - "name": "Alex Thomas", - "message": "Yo, your 
order will be shipped soon!" - }, + "intent_template_id": 280, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "304" - }, - "expected_data": { - "comment": "Yo, your order will be shipped soon!" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/304/", - "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", - "required_contents": { - "exact_match": "Yo, your order will be shipped soon!" - } - } - ] - }, - "intent_template_id": 280, - "changelogs": [ + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": {"name": "Alex Thomas", "message": "Yo, your order will be shipped soon!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "304", + "site": "shopping_admin", + "expected": {"comment": "Yo, your order will be shipped soon!"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 495, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": 
"__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": { - "name": "Lily Potter", - "message": "Thanks, your order is ready to be shipped!" - }, - "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "303" - }, - "expected_data": { - "comment": "Thanks, your order is ready to be shipped!" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/303/", - "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText", - "required_contents": { - "exact_match": "Thanks, your order is ready to be shipped!" 
- } - } - ] - }, "intent_template_id": 280, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", + "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", + "instantiation_dict": {"name": "Lily Potter", "message": "Thanks, your order is ready to be shipped!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "303", + "site": "shopping_admin", + "expected": {"comment": "Thanks, your order is ready to be shipped!"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 496, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": { - "tracking": "8974568499", - "order": "299", - "service": "Federal Express" - }, - "intent": "Update order #299 with the Federal Express tracking number 8974568499", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "299" - }, - "expected_data": { - "shipping_carrier": "Federal Express", - "tracking_number": "8974568499" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { 
- "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/299/active_tab/order_shipments/", - "locator": "", - "required_contents": { - "must_include": [ - "Tracking number 8974568499 for Federal Express assigned" - ] - } - } - ] - }, "intent_template_id": 284, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update order #299 with the Federal Express tracking number 8974568499", + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": {"tracking": "8974568499", "order": "299", "service": "Federal Express"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "299", + "site": "shopping_admin", + "expected": {"shipping_carrier": "Federal Express", "tracking_number": "8974568499"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 497, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": { - "tracking": "24353446464", - "order": "307", - "service": "DHL" - }, - "intent": "Update order #307 with the DHL tracking number 24353446464", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "307" - }, - "expected_data": { - "shipping_carrier": "DHL", - "tracking_number": "24353446464" - } - } - } - ], - "site": 
"shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/307/active_tab/order_shipments/", - "locator": "", - "required_contents": { - "must_include": [ - "Tracking number 24353446464 for DHL assigned" - ] - } - } - ] - }, "intent_template_id": 284, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update order #307 with the DHL tracking number 24353446464", + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": {"tracking": "24353446464", "order": "307", "service": "DHL"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "307", + "site": "shopping_admin", + "expected": {"shipping_carrier": "DHL", "tracking_number": "24353446464"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 498, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": { - "tracking": "55591023930", - "order": "306", - "service": "UPS" - }, - "intent": "Update order #306 with the UPS tracking number 55591023930", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "306" - }, - 
"expected_data": { - "shipping_carrier": "UPS", - "tracking_number": "55591023930" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/306/active_tab/order_shipments/", - "locator": "", - "required_contents": { - "must_include": [ - "Tracking number 55591023930 for United Parcel Service assigned" - ] - } - } - ] - }, "intent_template_id": 284, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update order #306 with the UPS tracking number 55591023930", + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": {"tracking": "55591023930", "order": "306", "service": "UPS"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "306", + "site": "shopping_admin", + "expected": {"shipping_carrier": "UPS", "tracking_number": "55591023930"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 499, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": { - "tracking": "13849373987", - "order": "304", - "service": "USPS" - }, - "intent": "Update order #304 with the USPS tracking number 13849373987", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "304" - }, - "expected_data": { - "shipping_carrier": "USPS", - "tracking_number": "13849373987" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/304/active_tab/order_shipments/", - "locator": "", - "required_contents": { - "must_include": [ - "Tracking number 13849373987 for United States Postal Service assigned" - ] - } - } - ] - }, "intent_template_id": 284, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update order #304 with the USPS tracking number 13849373987", + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": {"tracking": "13849373987", "order": "304", "service": "USPS"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "304", + "site": "shopping_admin", + "expected": {"shipping_carrier": "USPS", "tracking_number": "13849373987"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 500, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": { - "tracking": "239028439840", - "order": "301", - 
"service": "DHL" - }, - "intent": "Update order #301 with the DHL tracking number 239028439840", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "301" - }, - "expected_data": { - "shipping_carrier": "DHL", - "tracking_number": "239028439840" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/301/active_tab/order_shipments/", - "locator": "", - "required_contents": { - "must_include": [ - "Tracking number 239028439840 for DHL assigned" - ] - } - } - ] - }, "intent_template_id": 284, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update order #301 with the DHL tracking number 239028439840", + "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", + "instantiation_dict": {"tracking": "239028439840", "order": "301", "service": "DHL"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "301", + "site": "shopping_admin", + "expected": {"shipping_carrier": "DHL", "tracking_number": "239028439840"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 501, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{product}} as out of stock", 
- "original.intent_template": "Make all {{product}} as out of stock", - "instantiation_dict": { - "product": "Taurus Elements Shell" - }, - "intent": "Mark all Taurus Elements Shell as out of stock", - "original.intent": "Make all Taurus Elements Shell as out of stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "350" - }, - "expected_data": { - "in_stock": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/350/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "0" - } - } - ] - }, "intent_template_id": 287, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all Taurus Elements Shell as out of stock", + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": {"product": "Taurus Elements Shell"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use 'mark' instead of 'make'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "350", + "site": "shopping_admin", + "expected": {"in_stock": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 502, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - 
"start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{product}} as out of stock", - "original.intent_template": "Make all {{product}} as out of stock", - "instantiation_dict": { - "product": "Gobi HeatTec Tee" - }, - "intent": "Mark all Gobi HeatTec Tee as out of stock", - "original.intent": "Make all Gobi HeatTec Tee as out of stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "446" - }, - "expected_data": { - "in_stock": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/446/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "0" - } - } - ] - }, "intent_template_id": 287, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all Gobi HeatTec Tee as out of stock", + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": {"product": "Gobi HeatTec Tee"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use 'mark' instead of 'make'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "446", + "site": "shopping_admin", + "expected": {"in_stock": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], 
"task_id": 503, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{product}} as out of stock", - "original.intent_template": "Make all {{product}} as out of stock", - "instantiation_dict": { - "product": "rocco gym tank" - }, - "intent": "Mark all rocco gym tank as out of stock", - "original.intent": "Make all rocco gym tank as out of stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "682" - }, - "expected_data": { - "in_stock": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/682/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "0" - } - } - ] - }, "intent_template_id": 287, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all rocco gym tank as out of stock", + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": {"product": "rocco gym tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use 'mark' instead of 'make'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "682", + "site": "shopping_admin", + "expected": {"in_stock": false} } - ] + ], + 
"revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 504, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{product}} as out of stock", - "original.intent_template": "Make all {{product}} as out of stock", - "instantiation_dict": { - "product": "Selene yoga hoodie" - }, - "intent": "Mark all Selene yoga hoodie as out of stock", - "original.intent": "Make all Selene yoga hoodie as out of stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1108" - }, - "expected_data": { - "in_stock": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1108/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "0" - } - } - ] - }, "intent_template_id": 287, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all Selene yoga hoodie as out of stock", + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": {"product": "Selene yoga hoodie"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use 'mark' instead of 'make'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": 
"BackendStateEvaluator", + "product_id": "1108", + "site": "shopping_admin", + "expected": {"in_stock": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 505, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Mark all {{product}} as out of stock", - "original.intent_template": "Make all {{product}} as out of stock", - "instantiation_dict": { - "product": "Aeon capri" - }, - "original.instantiation_dict": { - "product": "Aeno capri" - }, - "intent": "Mark all Aeon capri as out of stock", - "original.intent": "Make all Aeno capri as out of stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1861" - }, - "expected_data": { - "in_stock": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1861/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "0" - } - } - ] - }, "intent_template_id": 287, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Use 'mark' instead of 'make'" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Mark all Aeon capri as out of stock", + "intent_template": "Make all {{product}} as out of stock", + "instantiation_dict": {"product": "Aeon capri"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "task_ambiguity", - "note": "Incorrect spelling of product name leading inability to find product. 
Changed 'Aeno' to 'Aeon'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1861", + "site": "shopping_admin", + "expected": {"in_stock": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 506, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", - "instantiation_dict": { - "product_category": "meat substitute", - "dollar_value": "between 100 and 200" - }, - "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B01CTR3DLE" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B01CTR3DLE" - ] - } - } - ] - }, "intent_template_id": 172, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": {"product_category": "meat 
substitute", "dollar_value": "between 100 and 200"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01CTR3DLE"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 507, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", - "instantiation_dict": { - "product_category": "Ceiling light", - "dollar_value": "above 1000" - }, - "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B07BVL3P1V" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B07BVL3P1V" - ] - } - } - ] - }, "intent_template_id": 172, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + 
"instantiation_dict": {"product_category": "Ceiling light", "dollar_value": "above 1000"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07BVL3P1V"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 508, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", - "instantiation_dict": { - "product_category": "NS switch pouch", - "dollar_value": "under 60" - }, + "intent_template_id": 172, + "start_urls": ["__SHOPPING__"], "intent": "Buy the highest rated product from the NS switch pouch category within a budget under 60.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B07116LGP6" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B07116LGP6" - ] - } - } - ] - }, - "intent_template_id": 172, - "changelogs": [ + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": {"product_category": "NS 
switch pouch", "dollar_value": "under 60"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07116LGP6"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 509, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", - "instantiation_dict": { - "category": "Men's shoe" - }, - "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B01J4MM3KO" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B01J4MM3KO" - ] - } - } - ] - }, "intent_template_id": 216, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is 
least expensive", + "instantiation_dict": {"category": "Men's shoe"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01J4MM3KO"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 510, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", - "instantiation_dict": { - "category": "Home Audio Speaker" - }, - "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_ordered", - "expected_data": { - "sku": "B002R5ABIW" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "func:shopping_get_latest_order_url()", - "locator": "document.querySelector(\".order-details-items.ordered\").outerText", - "required_contents": { - "must_include": [ - "B002R5ABIW" - ] - } - } - ] - }, "intent_template_id": 216, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive", + "intent_template": "Buy the best rating product from \"{{category}}\" 
category with at least 5 reviews and the product is least expensive", + "instantiation_dict": {"category": "Home Audio Speaker"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B002R5ABIW"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 511, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": { - "product": "laundry detergent" - }, + "intent_template_id": 189, + "start_urls": ["__SHOPPING__"], "intent": "Add a laundry detergent to my wish list.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "laundry", - "detergent" - ] - } + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": {"product": "laundry detergent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "skus": [ + "B00VRAF73M", "B01M10LBSQ", 
"B074QVN413", "B0777TM4B8", "B07N8SJ5GF", + "B08V7GXG7F" + ] } - ] - }, - "intent_template_id": 189 + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 512, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": { - "product": "toothpaste" - }, + "intent_template_id": 189, + "start_urls": ["__SHOPPING__"], "intent": "Add a toothpaste to my wish list.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "toothpaste" - ] - } - } - ] - }, - "intent_template_id": 189 - }, - { - "sites": [ - "shopping" - ], - "task_id": 513, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": { - "product": "chair" - }, + "instantiation_dict": {"product": "toothpaste"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "skus": [ + "B00028F3WU", "B0007DA4IS", "B000KU720Q", "B000NRT0TW", "B000OEXMW0", + "B000RL6K0Q", "B000WSV82Y", "B0013G6M3E", "B0013USYR2", "B001CMW800", + "B001E0SWB4", "B001E16LEI", "B001E6JLOA", "B001E77NOO", "B001ECQRGO", + "B001EJOPUC", "B001FWXS10", "B001G0MEC0", "B001I63AUW", "B001RK3Q5S", + "B001RUXR2K", "B001WAKI3Q", 
"B00277BGOI", "B00288D752", "B002BL75FO", + "B002EIW31A", "B002JAI47K", "B002K5V4OY", "B0031Z66O0", "B00394ZNUQ", + "B003CMNQZY", "B003FWQ0JA", "B003LZSQ5M", "B003WVTGSQ", "B0041U8TGW", + "B0041WF82I", "B004EML5CW", "B004FSYPC2", "B004GW64K8", "B004GWVQEC", + "B004HNJ7HI", "B004I75GXW", "B004I7756S", "B004I7985O", "B004J759U6", + "B004K3WSBM", "B004M99CDG", "B004NTFXKG", "B004XRN0DU", "B004YWOMZE", + "B004Z2484S", "B00518BXO8", "B00531UB6Y", "B0053XLIN2", "B005B0RHR8", + "B005B0RIZY", "B005FAL8ZG", "B005FGKT9Q", "B005FUHD8W", "B005JC3N0I", + "B005PKTDZ8", "B005TKZH9U", "B0061WDOHW", "B006H9J4VY", "B00757A9B6", + "B007HO56HO", "B0080L9FHA", "B0080L9FUM", "B0084DKJL0", "B008D5I4XY", + "B008K8BWL0", "B008L39LAS", "B008UBWJTG", "B0092IS26U", "B0094E5ZOY", + "B0096CZ7W4", "B0096I3OSM", "B009AO70OG", "B00A15CDKE", "B00A1CGLUU", + "B00A3UIHIY", "B00B977F60", "B00CID86AI", "B00CLG85M6", "B00D5766VC", + "B00DAHBSZQ", "B00DGDLYBC", "B00DLSSDNO", "B00DX5LCOC", "B00DZEE7O8", + "B00E4MKPYQ", "B00E4ML7PC", "B00E4MQY3W", "B00F27E520", "B00F5DXIJC", + "B00FD1YRNC", "B00FJD7C68", "B00G4ELZF0", "B00G4EM07M", "B00GDTFK1Q", + "B00GHR8X4K", "B00GYG1NUU", "B00GYUTEZM", "B00H202F0O", "B00HA71X7A", + "B00HT5CQ06", "B00I5H5ZEG", "B00IAJG0XO", "B00IG0SXRI", "B00IOPZ7W4", + "B00IUIVMGK", "B00IXKX0DI", "B00IXL0BSY", "B00IZ6F8K8", "B00J2L54O0", + "B00J36583Q", "B00J5J7F0A", "B00J5JAAJI", "B00J5JB89Y", "B00J7GHNLC", + "B00JIJ6KXU", "B00JIKZHCY", "B00JR20OKS", "B00JRTG2FW", "B00JUJ1BCS", + "B00JUJ1BI2", "B00JVG9PV4", "B00KF98NAU", "B00KIHL02M", "B00LAY88ZY", + "B00LEVU3KG", "B00LJ0X24G", "B00MH85X7S", "B00MPRGT1A", "B00NESSWAA", + "B00NIAULVC", "B00NQOYQCK", "B00R3K2KYE", "B00S7MC9SY", "B00TA4B7JA", + "B00TNMWYFA", "B00VQTOSCA", "B00VVLZ0CA", "B00W0FEWUC", "B00W5MLOFG", + "B00XU23FFW", "B00XWTNKD0", "B00ZATYH3C", "B00ZGX4OW6", "B0105DZI3U", + "B010AXZ916", "B010TGMIO0", "B010XR5MJI", "B012RIL1M8", "B01414P92Y", + "B0141GSTA6", "B01488F40K", "B014DUHR94", "B015T474SC", "B0160NT6FU", + 
"B016OPY4EC", "B0170YM9EI", "B017KQRZAE", "B0184ZBLPK", "B018ILDW3Y", + "B018KS268M", "B018KS2U8S", "B018KURG1W", "B018LM1MSM", "B0195UTBGY", + "B0195UTBHI", "B019BR67UY", "B019JE0WXW", "B01ANVG068", "B01B1E050S", + "B01BJQ4GDK", "B01BLP49HW", "B01BNF2904", "B01BYQZ52G", "B01C3GFNA0", + "B01C47HJH8", "B01DJBDXH8", "B01DWR0AR0", "B01F4JR0TA", "B01F66UR5K", + "B01GR1QZSQ", "B01H8ABSAA", "B01HA8D1SM", "B01HNCJ864", "B01HQVC38C", + "B01HVQWGQQ", "B01IA9BJ4I", "B01IA9BN44", "B01IADWIAS", "B01IADWJOS", + "B01IADWNUI", "B01IADWUUQ", "B01IADX5DW", "B01IADXOXS", "B01IAFLGHW", + "B01IAFLM0I", "B01IAFLNE8", "B01IFHDLYQ", "B01JN6HHDY", "B01KCJENYC", + "B01KCJG1RO", "B01LVXPXY9", "B01LWOHLOV", "B01LXO6WHU", "B01LZW6QAY", + "B01M0INUID", "B01MR5SH9Z", "B01N23FCL2", "B01N3NPZTL", "B01N49LBSI", + "B01N7JXTVU", "B01N7PMK4M", "B01N97Q4BE", "B01NASHAAW", "B01NBU1MJT", + "B06W5JWFPJ", "B06WP9KG52", "B06WP9Y81S", "B06X419HV1", "B06XCWJJ53", + "B06XH21RDK", "B06XJNZHQC", "B06XKWKY3C", "B06XX2P91N", "B06Y4B4VNY", + "B06ZY1ZZ85", "B071D9J53C", "B071H57RP1", "B071J4W1VW", "B071WVVDSZ", + "B071XT8YMQ", "B072WB4W3Y", "B0735BJP4B", "B0735DL5W5", "B073FCXT4Z", + "B073VPT81R", "B073WD693Y", "B073WR94PV", "B073ZJCG5V", "B0744G9ZRH", + "B07469FDK1", "B0748DLWD2", "B074D9RYKG", "B074J27TZF", "B074J99K5Z", + "B074KHS4J9", "B074T52KP5", "B074ZPFJNM", "B075XSN2HZ", "B0763NTP8X", + "B0763PFFKH", "B0767PTXS8", "B076H3WKRL", "B0771V325R", "B077H6PVN1", + "B077H74B75", "B078938GT9", "B078HQ48JX", "B078JJ538V", "B078JJ56VP", + "B078NJFR48", "B078YG5SVQ", "B078YGYYNB", "B0792LV9TN", "B0792M4NQL", + "B0792MRTG1", "B0792PZCKN", "B0792QF9Z6", "B079945GC2", "B079DD9GWZ", + "B079Y99F75", "B079YW2HNF", "B079Z94R32", "B07B2PT7T3", "B07BMRVLV5", + "B07BQDPVQ3", "B07BS8D4RC", "B07C1ZWH94", "B07C5M9VNV", "B07CCDD61K", + "B07CKX86XP", "B07CMXS929", "B07CX7R8RP", "B07CZ6QXX9", "B07D1S5468", + "B07D496BHJ", "B07DF7G67C", "B07DP6W9HK", "B07DP72KYF", "B07FCJWVCC", + "B07FKXXPWR", "B07FN84GP8", "B07FXWSXW8", "B07G7F5HHR", 
"B07GD6DSJQ", + "B07GNTD35G", "B07GZ5X5VC", "B07GZ5X6M1", "B07GZ6XYJR", "B07GZ72CLF", + "B07GZ771DJ", "B07GZ8JK4Q", "B07GZGN942", "B07HR3M4TC", "B07JNJ3LPT", + "B07JQT13NL", "B07KFZ5TZQ", "B07KY1D3T4", "B07KZR6PYK", "B07L61QFSR", + "B07LHJDLS3", "B07MCQ1WNJ", "B07MK5VKQY", "B07MMHZ7RD", "B07N85QTL5", + "B07N89BMDZ", "B07NDGM7WD", "B07NGP8R4J", "B07NK8GCGD", "B07NQRS8ZS", + "B07P6LR9ZD", "B07PMNCCH4", "B07PS5NFCW", "B07PT41YZQ", "B07PTKVVML", + "B07PYRBBKV", "B07Q4QNK49", "B07Q6LLPM5", "B07Q8BMCB6", "B07QGC7N6R", + "B07QKZNL77", "B07QMB4FZ6", "B07QSS11LC", "B07R23Q7VQ", "B07R926VLW", + "B07RHYZBD9", "B07RJ9WCVD", "B07RN2L869", "B07RZTYVBB", "B07S2YB6VH", + "B07S77SVS9", "B07S844G5N", "B07S9LLHJD", "B07SNPBW86", "B07SPTPF4Y", + "B07T4B7YNF", "B07T9CK42J", "B07TLZSWSL", "B07TRKYZ9D", "B07TVSF5S9", + "B07TX37WW2", "B07TXZ8ML7", "B07V5QQH6S", "B07V6Q6FV5", "B07VN73SNX", + "B07W4CSVBY", "B07WR6FCLR", "B07XHWS8Q1", "B07XQQNVN1", "B07Y2BVJ4X", + "B07Y2DXRCY", "B07Y4JR7SR", "B07YBJT86S", "B07YFYNLZC", "B07YQ39VKP", + "B07YYQJM8T", "B07ZHZWH7X", "B07ZQ6SJFG", "B07ZYBYQST", "B0813Y9JSW", + "B0813YH41G", "B0816Y4YRX", "B081B632Q3", "B081CZ1Q2S", "B081JL4F8B", + "B081Q5PF5W", "B081VW91ZT", "B08258WTC9", "B082F1LVBR", "B082VL1KQ9", + "B0836PYM8J", "B083BT1JV2", "B083HV5HLG", "B083JHCCV2", "B083TZXWB6", + "B08414HZ3S", "B0848M8FYQ", "B084BPVPMZ", "B084BTXCMZ", "B084BZP8QY", + "B0853BQ42Q", "B0856Q567M", "B085L3842H", "B085WBW3N2", "B086H6WSTJ", + "B086KM71JR", "B087WZL9CG", "B087X1R6JH", "B0887XN77Z", "B0887Y8F3V", + "B088FVSMVF", "B088MFQTHY", "B088WVS5VX", "B08923V311", "B0899XZSNT", + "B089KVNV55", "B089S9HKF6", "B089VGNJSB", "B08B125T7N", "B08CYCJFVQ", + "B08D6H45V1", "B08DNRS5PW", "B08FMPTFTG", "B08FXP4757", "B08FXS828J", + "B08G1XJJFV", "B08G81N9NB", "B08H5MR8QW", "B08HKF9NSN", "B08HQWQR2S", + "B08J7RYJSM", "B08KJHMSKJ", "B08LM728R2", "B08N4BXJTK", "B08NCQGZJ8", + "B08NK4SHXY", "B08P3FRWQR", "B08PMTSBBB", "B08PTDZZQQ", "B08PV13VLD", + "B08QSJTQXS", "B08R7CYPWG", 
"B08R7W193L", "B08RCRNJBT", "B08SKX2Z1M", + "B08TRMF995", "B08TTHQF5Q", "B08VDRL2T7", "B08VJDVQ81", "B08W277P5L", + "B08WTWYXT2", "B08XN22DQN", "B08XQ97YV2", "B08YNK6MVV", "B08YY4LST7", + "B08ZKMV7MG", "B0915TLNJC", "B0915X52TC", "B0917PN39F", "B091DWV93B", + "B0923S1MWS", "B092XB11YC", "B0936X16ZX", "B093QG3KBD", "B093RF6CPZ", + "B09413DRYS", "B094H16MCZ", "B094ZBNGH6", "B095CVGGK7", "B095SYZ16P", + "B095YDBXKX", "B095Z6SVNB", "B0963Q5CT6", "B097T129C8", "B0982ZJMND", + "B098DYSPVX", "B098FLJKVD", "B099BZHZP1", "B099J4YBBG", "B099JF2K7J", + "B099KVT3M6", "B099PD2FDN", "B09B6G62JK", "B09B7M8K3R", "B09BCTJC4Y", + "B09BCV4NS1", "B09BCVBK53", "B09BFZZ19Z", "B09BMFFRB2", "B09CFR181D", + "B09CM36M5Q", "B09CPB2ZCH", "B09DRT2DBM", "B09DYQ1RBR", "B09F5GD2YW", + "B09FF5811V", "B09FHXKBWK", "B09FJH65B4", "B09FLHJBD5", "B09FM129KX", + "B09FM2DBN6", "B09FXSFY8P", "B09G2QZYTP", "B09G2ZPB5L", "B09G5THVP8", + "B09GBLDC1W", "B09GFC528V", "B09GM6WV79", "B09GVQZRDR", "B09H5BWDPG", + "B09HNWBNGF", "B09HQ83ZY4", "B09HTPC8G9", "B09J4ZPZQX", "B09J88Z66L", + "B09JGBFBQY", "B09JP1QVTC", "B09K7DF7JF", "B09KG3ZJJN", "B09KMY1RVP", + "B09KRVB3W7", "B09KYCLCBZ", "B09L3B3V9D", "B09L7VVY17", "B09LCTPKGT", + "B09LLKTB2G", "B09LQV19Z5", "B09LTTCGGJ", "B09LVDV5GS", "B09M1QCB7N", + "B09MFNJYXF", "B09MGWGF2P", "B09MQL93X6", "B09MQLZ9GW", "B09MQM3DY2", + "B09MQMLD6Q", "B09MQSMD1G", "B09MQVN528", "B09MRCQJ3Y", "B09MRRTSMR", + "B09MRTX572", "B09MSNXR2W", "B09N9CBVLL", "B09N9WM3VG", "B09NBQ7BYL", + "B09NMCZ634", "B09NNK1NK5", "B09NP1876Z", "B09NVY6472", "B09NWBSTM9", + "B09NX91TXG", "B09NY92KRC", "B09NY9TVQJ", "B09NY9WRJP", "B09NYF3N14", + "B09P4RXW46", "B09P67MF9B", "B09P7T1P6T", "B09P7ZCHY9", "B09P8GG6JY", + "B09P9Z7VMZ", "B09PBT25FQ", "B09PDFMLK3", "B09PDT7PQL", "B09PHB4XGP", + "B09PNC6152", "B09PRDQG1R", "B09PV2KQN3", "B09PVFJDB1", "B09PVPJJ7L", + "B09PVQQPJV", "B09PXYS6HC", "B09PY7RR75", "B09PYL23PC", "B09PYLHLLJ", + "B09PYPGNTS", "B09Q17RP7T", "B09Q2DRTJB", "B09Q4CY5SG", "B09Q5X789L", + 
"B09Q5ZRS2X", "B09Q675GH6", "B09Q6H3LN5", "B09Q827KDV", "B09Q8Q62PP", + "B09Q8XWKHX", "B09QBZQJ61", "B09QCMHFT5", "B09QFWWD9B", "B09QG1GD19", + "B09QG6QB9K", "B09QG88RXS", "B09QGHBB3M", "B09QJ7C3XX", "B09QKPW5WC", + "B09QL1HXG6", "B09QM3V2FZ", "B09QMDQXCV", "B09QMFFLDR", "B09QP3TYXJ", + "B09QPRL8PJ", "B09QQ5KZFY", "B09QQDMG75", "B09QSMDB16", "B09QSR758Z", + "B09QWZ36C9", "B09QWZFNTF", "B09QX3GMJ2", "B09QX4VW3M", "B09R1Q8Z74", + "B09R1QZJQK", "B09R1WC4PH", "B09R1YK2NV", "B09R2KPMMW", "B09R2LSSPM", + "B09R47Z36F", "B09R6NKZ7B", "B09R7KK3H6", "B09R8CTYHJ", "B09RFD44WZ", + "B09RJM2B15", "B09RMN3Y5M", "B09RMRPPCW", "B09RSJ5D9S", "B09RZPGCWB", + "B09RZR9VYH", "B09S3NVLCV", "B09S3SP6LB", "B09S5NS2WH", "B09S5RW1ZX", + "B09S5T2T4R", "B09S5WT75M", "B09S5Y2253", "B09S63FZNP", "B09S6QSQ4V", + "B09S6SN1XW", "B09SBD9YZY", "B09SCVNW6X", "B09SG21HJV", "B09SG21SHS", + "B09SGMW7T6", "B09SHBNH3B", "B09SHT47YZ", "B09SJ13L93", "B09SJ1CC73", + "B09SJ1LYKM", "B09SPPXSG6", "B09SQDJW6D", "B09SYPS6SJ", "B09T3LZNFL" + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 513, + "intent_template_id": 189, + "start_urls": ["__SHOPPING__"], "intent": "Add a chair to my wish list.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "chair" - ] - } - } - ] - }, - "intent_template_id": 189 - }, - { - "sites": [ - "shopping" - ], - "task_id": 514, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": { - "product": "white desk" - }, + "instantiation_dict": {"product": "chair"}, + "format_specification": null, + 
"start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "skus": [ + "B0002M9LP6", "B000Q5XTE8", "B000UKLJ5K", "B000WK1X4U", "B001BWYA48", + "B001GCKOAW", "B001LJY36Q", "B001OW7JW0", "B0039MIBH0", "B0039MIMRE", + "B003BWS81C", "B003R8BOXO", "B003S7HHEY", "B003UYZMA6", "B003VYAJMG", + "B003WX99I6", "B0042H6SLW", "B0042TU6WW", "B0046TR4IW", "B0049MWPGC", + "B004D2UD30", "B004LQ1U9O", "B004VTHKXQ", "B005L0R0OI", "B005QHQYR0", + "B006SYV38K", "B00752SXZU", "B007HDY64O", "B007VLXFA8", "B008J4ZE7I", + "B008OTQ864", "B008OTSHSQ", "B008OTSIY4", "B009WNCPHW", "B00AVUQPSU", + "B00AZQKG4O", "B00BJ7D1FE", "B00BXKFHTK", "B00CKR4QT2", "B00EQ1TJII", + "B00F9GDMM8", "B00FW1AHP0", "B00G4NTR38", "B00GBUQAYI", "B00GOJDB7A", + "B00GPK8ODY", "B00HV9YK22", "B00IT42FXE", "B00J8U1C2S", "B00JWJJP3S", + "B00KOP4V8S", "B00KUPS6G0", "B00L3NQP0C", "B00MPG96M0", "B00MQ3GWFG", + "B00MSIB4H0", "B00NAWEE9S", "B00OCSNMO2", "B00OIQHCDA", "B00OU7MFAW", + "B00P21TU6C", "B00PW9UF0E", "B00PZMS73U", "B00QGY0PQS", "B00QSLD8XQ", + "B00RDJ8FNU", "B00RKNESIK", "B00TV4FYOI", "B00UGB65C0", "B00UZ369KC", + "B00YB262HS", "B00YRBDRTE", "B010C71PJA", "B0116W5B9K", "B01257NFZ2", + "B013B74U5O", "B013JBE2IC", "B016OIF2JU", "B017NEJWC2", "B017UM91PU", + "B0183K9PNQ", "B018462YV4", "B01AFODPO8", "B01B4X0RGI", "B01BDPX98A", + "B01BIEHME8", "B01BL29M40", "B01BTH2XZM", "B01CINCTE6", "B01DMT92M2", + "B01DOBJLRY", "B01DZRM30Y", "B01F7B219O", "B01F8MD9HK", "B01FUAQCSS", + "B01FWQ90KC", "B01G2ELLMI", "B01GTQF5N0", "B01GTRTGDE", "B01GTS4L1U", + "B01H765O3C", "B01HMWRNUS", "B01HQGUBVS", "B01I5B1Q1M", "B01IKMHF66", + "B01IQYBFVE", "B01IR8U42K", "B01J4MVL48", "B01J4NE2FC", "B01KQ4KS1A", + "B01LDHMZO2", "B01LJ1GKZW", "B01LN2K7FQ", "B01LYNG09O", "B01LZHLH12", + "B01M0JHX3F", "B01M10CXK5", 
"B01M5FURAH", "B01MQTZTFY", "B01MQW49T3", + "B01MU7H89G", "B01MYA7OAK", "B01MYFPK88", "B01N0C01Z6", "B01N0XC48T", + "B01N1V0RT0", "B01N2WKUZB", "B01N4QB5WP", "B01N5FV14H", "B01N5O58OA", + "B01N5R9MBW", "B01N8SQPGS", "B01NAEM3NY", "B01NBWJDJZ", "B01NCEU0UK", + "B01NCY3ZH1", "B01NH2WXQ8", "B06VSXQXPX", "B06VVW24GY", "B06WD2PTCX", + "B06XD964Q5", "B06XDPT7BG", "B06XG4L63W", "B06XJ9KGCZ", "B06XP295XG", + "B06XPGYDCP", "B06XQW9KYP", "B06XTNKQFB", "B0716Z6WSJ", "B07176BLWK", + "B0719JSKS8", "B071JN9CM3", "B071LJTW4H", "B071NFSYLP", "B071NVV6B6", + "B071VG6PCC", "B07255YSWD", "B0725JXS9V", "B072BZ4XVJ", "B072FD6GLT", + "B072JMFMJC", "B072KDRZHP", "B072YFNQSR", "B073TY9SG9", "B073TYNJG4", + "B073VD36X8", "B073WQGDLC", "B073YDPZ5P", "B07451FV2F", "B074MZMXRY", + "B074VB3WNV", "B0759WQ3YW", "B075F9ZTT6", "B075H5D2LP", "B075ZX9F9Y", + "B0761R67K6", "B076DFC9RY", "B076HBJSVZ", "B076JJXY3X", "B076JRNQSQ", + "B076MGX5S5", "B076PMYFLL", "B076PS6FJ8", "B076Q33BYY", "B076ZYJPDL", + "B0773KJ2J9", "B077DRRXQT", "B077HDH76X", "B077HF6PML", "B077KY33TK", + "B077T9NHCS", "B077XWCV6D", "B078NPB4MR", "B078TDS6XG", "B078WZH3BQ", + "B078XZ211C", "B078Y49NC6", "B078Y8DSZS", "B078Z2P4XN", "B078ZL3H6J", + "B078ZLDSB5", "B078ZLFHV8", "B078ZM8VLB", "B0791CRM7L", "B0795W72NN", + "B0797HZ8W1", "B079CQN4KV", "B079HQ38SG", "B079J438ZT", "B079K3HP8G", + "B079RKQD1F", "B07B3NM7SP", "B07B9W2JSS", "B07BDHGYXR", "B07BKY1198", + "B07BXZX8N8", "B07BYJK8WK", "B07C61J4YR", "B07C69LPXX", "B07C7FZR5D", + "B07C89GZ6V", "B07CBG28PZ", "B07CBG3HFR", "B07CBG3HFY", "B07CBG496R", + "B07CHSR5SZ", "B07CHXSTJF", "B07CNX8SQ6", "B07CQK1W6F", "B07CWVR72G", + "B07CYLPD36", "B07D3RHMR8", "B07D9GR9KH", "B07DB45MPS", "B07DB4R43W", + "B07DD4QG4Y", "B07DLTW6TP", "B07DS19FCY", "B07DWVP5P6", "B07DXS9Z3K", + "B07F1R6QSG", "B07F2258F2", "B07FFG458D", "B07FKVFMXS", "B07FQTNSJP", + "B07FSCJTLX", "B07FSN9Z25", "B07FY56WR2", "B07FYMJMZX", "B07G3ZL8BM", + "B07G4K6VXG", "B07G9YL7YN", "B07GFSZ1GT", "B07GHS2GGN", "B07GTCHQSK", + 
"B07GTJH9S9", "B07GVFJP9R", "B07GVLXQDG", "B07GXJ3X7K", "B07GXJ5SHY", + "B07H3GKDYT", "B07H3KDMCY", "B07H3PG4ZJ", "B07H61TKMJ", "B07H717K62", + "B07H8XC2TC", "B07HF4YY52", "B07HFV5WXS", "B07HJVF26H", "B07HKZG9Y2", + "B07HNG4BR1", "B07HRHPZ69", "B07HSLP8FL", "B07HSWTVH8", "B07HWXN5DS", + "B07JGCB2GC", "B07JH6H5MP", "B07JPL7LC5", "B07JV7WWLC", "B07K1KYFWV", + "B07K3WRHLC", "B07K5TBB46", "B07KDS893Z", "B07KGPM6NX", "B07KNNM1BZ", + "B07L4KG1RW", "B07L5JB9ZX", "B07L82FM8Z", "B07L871TG5", "B07LFDK2R2", + "B07M5DQ1T9", "B07M64YFVN", "B07MC3V467", "B07MC7F12W", "B07MJQBY1C", + "B07MLT29MK", "B07MQJ5G1B", "B07MQTJ8MB", "B07MTM3H4W", "B07N1VR1SB", + "B07N1WG3ZZ", "B07N39N41Q", "B07N64X68W", "B07NJN8C92", "B07NKJLB7Z", + "B07NNWGJMY", "B07NRHSG7Y", "B07NY28PLV", "B07NYWXLDN", "B07P1DLRDY", + "B07P6JTVZV", "B07PBKZ59L", "B07PDXZS7H", "B07PG1Q7HF", "B07PMLRM15", + "B07PMRWR8W", "B07PNB13FL", "B07PS1H6XX", "B07PV2J77H", "B07Q26LJL6", + "B07Q2H39CW", "B07Q5L5488", "B07Q5XRMLD", "B07Q8JSJ6F", "B07QDNRFKJ", + "B07QG1SV98", "B07QLT67GN", "B07QLWTLLN", "B07QNTQKDP", "B07QRV2WKJ", + "B07QXKDV1V", "B07R3V952B", "B07R6RWYMZ", "B07R7PFW15", "B07R9Q36XM", + "B07RGKYS9K", "B07RZYCYK7", "B07S1L49ZT", "B07S5PYH6Q", "B07S745TYG", + "B07S8PCBGF", "B07S8VR9WJ", "B07S95TJDL", "B07S9RHZCD", "B07SCX2YMW", + "B07SJG72FG", "B07SW2JLK6", "B07SYHF5R2", "B07SZBWWDH", "B07T1193KF", + "B07T59J8YV", "B07T65P8XS", "B07T6Y57RM", "B07TD186H6", "B07TKYD9WK", + "B07TQT2YMB", "B07TT87TQW", "B07TTWTXQH", "B07TW28S2S", "B07TX4KV7K", + "B07TZPSB6G", "B07V1JYGT4", "B07V43SDMD", "B07V9S45CD", "B07VB7D6HZ", + "B07VB8TVRZ", "B07VCH7L8J", "B07VCH7PP6", "B07VFXKMNH", "B07VGLTMLY", + "B07VHPQMSX", "B07VK653KB", "B07VKLSZ4K", "B07VLMLJ4K", "B07VMGVZ1P", + "B07VPSPRR2", "B07VTZZMNY", "B07VVPD3PN", "B07VXTT5JW", "B07VY8K19X", + "B07VYX245K", "B07VYXM16M", "B07W5DZLZK", "B07W5S5D7Z", "B07WK5FT6N", + "B07WLDNZTW", "B07WNNCV74", "B07WNT5GMZ", "B07WS7YJLJ", "B07WVD6VL3", + "B07WWT8LWZ", "B07WWTKBJW", "B07WX825B5", "B07X2CKWD8", 
"B07X5NBRCL", + "B07X7MBYV2", "B07X92VH7C", "B07X9ZJBCV", "B07X9ZQG3J", "B07XC6G91T", + "B07XCM9SMW", "B07XCVJHP3", "B07XDYLCLD", "B07XFCT1P6", "B07XHCL2BD", + "B07XHKZ7J4", "B07XK1LVC7", "B07XM9P3X3", "B07XQ6H8LS", "B07XQPBCSY", + "B07XRFTD77", "B07XWYFKRH", "B07Y1SPDFS", "B07Y29VWLF", "B07Y38GFGT", + "B07YB5M9XK", "B07YDW4SPD", "B07YF3RZ6Q", "B07YFHGKDW", "B07YGNBFVV", + "B07YP7LNVZ", "B07YQFV7QK", "B07YQNGDKM", "B07YSGJZMT", "B07YXSZ2DF", + "B07YXSZ7H3", "B07YZBKFP7", "B07Z4FS3LF", "B07Z79B7Z3", "B07ZF9Y59G", + "B07ZHW9KF7", "B07ZNNJW9S", "B07ZPMBS7V", "B07ZS1JLHK", "B07ZW9QP69", + "B07ZWVZTF3", "B07ZX5SMM4", "B07ZYV95VZ", "B07ZZJKKK1", "B0811M32PM", + "B0812Z17RR", "B0813XXC6C", "B0816CVW6L", "B081GW63LW", "B081K72B57", + "B081MQWK51", "B081SXJ1FV", "B081VQ2CX3", "B081VYCDZM", "B081YDP24W", + "B08243WG1C", "B0827SZBY6", "B08281XPW1", "B082BKMHM1", "B082DR2HGZ", + "B082G9SKDC", "B082L1F4MS", "B082MDWHXM", "B082NW3WQZ", "B082PBJTT5", + "B082PCPQS9", "B082VKNM9K", "B082VQ21KB", "B082XJL59F", "B082YHQ859", + "B0831J7L9Z", "B08337T95Q", "B0833M8CLS", "B083545L5B", "B083BTC9Q1", + "B083FNGNZF", "B083MT6XHB", "B083TM2C5F", "B083VRVYCJ", "B083W3W9F4", + "B083W51RGF", "B083XYGFTN", "B083Z7HNNT", "B08411WVL8", "B0842NJ414", + "B0842Z6B97", "B0842ZKXPJ", "B0845P87XZ", "B0845Q6YKS", "B0846W55GW", + "B084C34FKY", "B084CZVRYH", "B084FF2ZNL", "B084FFM2YG", "B084FFML4Q", + "B084LJ7ZTX", "B084RZVHD2", "B084YT2MT5", "B084YWJ2HY", "B0853LZ2KK", + "B0854Z235T", "B085DJ6ZRH", "B085DNHL71", "B085SVMFCJ", "B085TS6HPZ", + "B085XZTMTK", "B085Y476KB", "B085ZJ1VLM", "B085ZRWSLC", "B0861XFFMK", + "B0866PCRV2", "B0867MY52S", "B0868VQVHX", "B086C285VB", "B086MLNLXN", + "B086R53Q71", "B086RPYM5H", "B086TXCVNR", "B086VYSZ29", "B086WN6DC2", + "B086Y77KLH", "B086YQWT6C", "B0872CR3QP", "B08765864R", "B087BJHNKX", + "B087DXNSN2", "B087JQM31Q", "B087KLT13X", "B087NLS2CQ", "B087PKXJFK", + "B087QBQRD5", "B087R8RMZC", "B087TZPGKL", "B0882P46V4", "B08835M1CR", + "B0887XLDYL", "B088H5CYXF", 
"B088KB1P3S", "B088KKCS4W", "B088NQBLP9", + "B088R5VDCB", "B088WC5W7Z", "B08976V7Z4", "B089DJ3X2D", "B089KB9843", + "B089LPSLTQ", "B089MCN5BT", "B089MWZN3P", "B089Q4TSFX", "B089VYHBHC", + "B089W5Y9J6", "B089W629P6", "B089YNHYQR", "B08B2HDJKF", "B08B5YRKL7", + "B08B637Q6X", "B08B68BBNY", "B08B8CFW5S", "B08BC7MMWZ", "B08BC8Q1LN", + "B08BCNB8X7", "B08BG5NDKY", "B08BJ67ZZM", "B08BJ723GP", "B08BKDVPDW", + "B08BL2YYG4", "B08BLM97LP", "B08BP9L79M", "B08BR4ZTRV", "B08BWR9XHP", + "B08BY91YQ9", "B08BZ6RDD9", "B08BZDXMTP", "B08C2WDVZ8", "B08C7D5VVW", + "B08C9B3R9F", "B08C9LTK5P", "B08CGJM3NZ", "B08CGYXL6B", "B08CHDKHXS", + "B08CMNWG55", "B08CTCLV36", "B08CVF664C", "B08CZ57S6V", "B08CZDXDPF", + "B08CZP8RBX", "B08CZQ56SD", "B08CZR9NNH", "B08D7Q4FHP", "B08D97PLHF", + "B08DNSN23H", "B08DRDWYCG", "B08DS3PGZ6", "B08DTQS27V", "B08DV73BW2", + "B08DWPMMCF", "B08DXKQTZS", "B08DXYJCVZ", "B08DY4RS68", "B08DY5QW1R", + "B08F3CGWVV", "B08F3JXDRX", "B08F4TZ458", "B08F5DDYMR", "B08F7ZQWNB", + "B08FB17N5F", "B08FBMW4KR", "B08FC6YMF2", "B08FDV3KDM", "B08FG3XN2C", + "B08FGMMV7Z", "B08FGNTTGG", "B08FHNZSV9", "B08FJ9WBBK", "B08FMK7KK1", + "B08FPJBVWF", "B08FRMFWY3", "B08FSNN6YM", "B08FT68T9Z", "B08FY9HJQ7", + "B08G1C1F4F", "B08G4CK7ZX", "B08G4Q2V85", "B08G77SGLY", "B08G8JZ8JB", + "B08GC5C5BV", "B08GCP34MQ", "B08GCQH2TY", "B08GGBGFBB", "B08GHSJ2VQ", + "B08GJ1B5M7", "B08GJHBL37", "B08GKKGFTF", "B08GNV4YYV", "B08GP89Q33", + "B08GQ696ZD", "B08GWT6T1Z", "B08GYMV5G4", "B08H4MY6QM", "B08H4RGBK1", + "B08H4RTBMJ", "B08H7DG9HW", "B08HC9DH2Z", "B08HD8SDX7", "B08HHZ46J7", + "B08HJ5VNBN", "B08HK39749", "B08HKZSN2M", "B08HM3LDB2", "B08HN1NPGZ", + "B08HN9K15V", "B08HRB68HY", "B08HRQHYWY", "B08HVM3GHY", "B08HWDCRSF", + "B08HYXGN71", "B08J2G7CV8", "B08J2R6L2G", "B08J4334DD", "B08J8C4R3F", + "B08JBX9DFD", "B08JGD6XWS", "B08JGT2YCQ", "B08JH43ZZH", "B08JHNGP9K", + "B08JPF7VGR", "B08JRXF687", "B08JSL8XNB", "B08JV62XGM", "B08JYY7MT7", + "B08JZ6NMZ3", "B08K2K3J4C", "B08K2XHCV4", "B08K34RNM1", "B08K758FG2", + 
"B08K91HN9V", "B08K94395B", "B08KFXYKMN", "B08KFXZZC5", "B08KHD75NQ", + "B08KL9BSCN", "B08KLPPNG1", "B08KPZMLWQ", "B08KT5GGKC", "B08L39H9T5", + "B08L3JZGDS", "B08L3YH8SD", "B08L461N1Q", "B08L5TVPXX", "B08L62TV6F", + "B08L6HKTF9", "B08L8LCRS3", "B08LD2RHFN", "B08LFYQ9C6", "B08LG1NNRN", + "B08LG9XSTQ", "B08LH7BQHL", "B08LHF7VLT", "B08LKMM9SG", "B08LMF347W", + "B08LS9FPZZ", "B08LSG69MN", "B08LSGKC84", "B08LVWBQVP", "B08LYRSQW6", + "B08M215R6C", "B08M3MS9W5", "B08M3MW7Z7", "B08M3ZDH7Y", "B08M3ZGLN6", + "B08M47BL4V", "B08M5CNSLW", "B08M5GWW3W", "B08M5LBJ9J", "B08M5LVPPV", + "B08M5M4772", "B08M5T347P", "B08M64ZY29", "B08M983DC1", "B08MBJTVQX", + "B08MFGSTY3", "B08MFJBFK2", "B08MFK1NL6", "B08ML5JQDC", "B08MPXPT2M", + "B08MQBHTL7", "B08MQMB5GD", "B08MTBFXP2", "B08MTS1KW7", "B08MZRF1LF", + "B08N3WPZ5C", "B08N5B2BFL", "B08N6F6BHQ", "B08N6JW8QN", "B08N6SMPSW", + "B08N6XKFG9", "B08NB6QJFJ", "B08NBBTFPG", "B08NJ1V4SR", "B08NPKWYBW", + "B08NPNBFXP", "B08NPYXF4H", "B08NQXP7RF", "B08NS77Q62", "B08NW152HQ", + "B08NWYV3BJ", "B08NX2DJ65", "B08NX2FP94", "B08NX7KYHM", "B08NXHVMCJ", + "B08NYLWX9V", "B08P1MN98G", "B08P1Z1PZ4", "B08P1ZXPJT", "B08P22Y83L", + "B08P2K8JHB", "B08P2L5F83", "B08P2NWVDS", "B08P3MYHCY", "B08P4MVNBW", + "B08P4PNTMR", "B08P4YFYMK", "B08P51BZXM", "B08P52YC4Q", "B08P5KF71X", + "B08P5Q2BPP", "B08P6M9L5V", "B08PCQFWH2", "B08PD2843V", "B08PKLQSZQ", + "B08PL1W4NC", "B08PL3CNGP", "B08PNY8LT3", "B08PP9KQLZ", "B08PPVVLJ4", + "B08Q1S1T9G", "B08Q3JX9W6", "B08Q7CY95N", "B08QCVQ1MG", "B08QF8XD1W", + "B08QG5J7TC", "B08QGPTJM9", "B08QHY5Z87", "B08QN37CV2", "B08QRFKQ9X", + "B08QRMY3JH", "B08QRPHTWM", "B08QV2QGSX", "B08QV6G73F", "B08QVCGJ29", + "B08R17ZBNG", "B08R2S2839", "B08R2ZFB1W", "B08R3PSW2C", "B08R66L28Y", + "B08R6VSYS6", "B08R74NWKH", "B08R8BWHLP", "B08R8JT7T7", "B08RD82QNR", + "B08RDGM6FD", "B08RN71L4L", "B08RR4JZNM", "B08RR5PJ1X", "B08RR6DQSQ", + "B08RSDX2TB", "B08RTHGKZM", "B08RTK7V3W", "B08RYLHPZ3", "B08RYY2GQL", + "B08RZ25KWB", "B08S2Y6KP9", "B08S38DZS9", "B08S3KL85G", 
"B08S6M942F", + "B08S6THC6H", "B08S6VS51T", "B08S7HHXDL", "B08SBDYZGR", "B08SCCKGM3", + "B08SH9SWGP", "B08SHXQ4BV", "B08SJLLKNP", "B08SK4HQVV", "B08SKY32DN", + "B08SPYC6FP", "B08SQJPH85", "B08SR3LY72", "B08SW1K2NV", "B08SW226WM", + "B08SW6RWMR", "B08SWL74DK", "B08T14FH5J", "B08T186JTQ", "B08T1DBKT2", + "B08T1MPFGX", "B08T61JT7H", "B08T7CS25P", "B08TB8B97C", "B08TBG9H3X", + "B08TBGDN6R", "B08TBGJPHG", "B08TBPQ53V", "B08TCR5V7N", "B08TCRZFPZ", + "B08TGH733K", "B08TGRN4X3", "B08TP1JS4W", "B08TTMZSW7", "B08TW3PKMF", + "B08TWY9XZ2", "B08V59811M", "B08VD1V9XB", "B08VD4739M", "B08VDNZMZZ", + "B08VF6PPD7", "B08VGCVRTN", "B08VJ4G41Q", "B08VND3Q31", "B08VRFGD3K", + "B08VRS2DVC", "B08VS7V4YW", "B08VVWSBYP", "B08VW7QR35", "B08W45HQLP", + "B08W52ZQFC", "B08W8QJPQ7", "B08W9RSB8H", "B08WCRHZ12", "B08WHBHCSG", + "B08WRY3ZVB", "B08WZ9Y9T2", "B08WZGPK5S", "B08X25GSSC", "B08X28WKM1", + "B08X3YJPJ7", "B08X41Z2VD", "B08X4CQ3QG", "B08X4F84WV", "B08X6GRX1M", + "B08X6LZQFV", "B08XB4K221", "B08XJTFF68", "B08XK1WY3S", "B08XLYS92M", + "B08XNM8LYM", "B08XNN9R75", "B08XNPY7ZY", "B08XNQSV69", "B08XWF74F8", + "B08XX55W2V", "B08XXD126G", "B08XYQPB1M", "B08Y1LXSGL", "B08Y1RR8F9", + "B08Y2PQZP2", "B08Y5F3BVV", "B08Y6L6GHS", "B08Y735644", "B08Y7JNGSH", + "B08Y8P3B27", "B08YDKKT8G", "B08YFL2CPD", "B08YJJ8D98", "B08YX151QW", + "B08Z1Z66LY", "B08Z26SFLK", "B08Z274YPS", "B08Z42K4X1", "B08Z8CYVNR", + "B08ZCF5H21", "B08ZCKYPG1", "B08ZCNQ275", "B08ZDRDMZY", "B08ZJ23PWT", + "B08ZJP6GWV", "B08ZKFH9C8", "B08ZMN5ZHQ", "B08ZN5BB95", "B08ZN8LBJF", + "B08ZNDLXFD", "B08ZV6FXZC", "B08ZXWR255", "B08ZYJVB2N", "B0911V3TPX", + "B0912TZB1P", "B0915TFYT9", "B0915XF43F", "B091BXL814", "B091C5862F", + "B091CFLDJY", "B091FGL75K", "B091FNRR2G", "B091J9NRS1", "B091K7C42P", + "B091KLXYLX", "B091MS2XMC", "B091MS9CH9", "B091MXRWWQ", "B091PMM9VJ", + "B091TKZJ76", "B091TPZCG3", "B091XWR8G4", "B091Y4TQWS", "B091YHHNHY", + "B091YQ9FV4", "B0922PHCGJ", "B0925FZL1V", "B0927FPD86", "B0927GZ5YJ", + "B0928QWQYC", "B09291D4G9", 
"B092HD5RZK", "B092HM68XW", "B092JR1L91", + "B092LCGN7Q", "B092M8F53R", "B092PNLVL3", "B092VLQNPN", "B092ZSFGM5", + "B09333RV2J", "B093B9RLHH", "B093BVWZY8", "B093C5L3Y7", "B093FDJ8H3", + "B093FNL67D", "B093K9SGX6", "B093P8P24D", "B093Y5KLLY", "B0943XZ96B", + "B094631831", "B0946M2JJ9", "B0946MNDB4", "B0946XWLWJ", "B094869J1B", + "B0948SLBBJ", "B0948WSSMH", "B094C1RLQM", "B094D271ZY", "B094FDK42K", + "B094MWK38V", "B094Q9NS4Y", "B094QJXDT2", "B094V6X323", "B094VDS76B", + "B094VG34CW", "B094VK56GN", "B094VNCW7L", "B094VVWLCH", "B094Y5JW7C", + "B0951GRZYW", "B0953846HT", "B09538PJ8K", "B0953D5CFT", "B0953KK817", + "B0957Y994D", "B095D8NPT2", "B095GXYP7W", "B095HWS33F", "B095KLDLZP", + "B095NNLX17", "B095P6HZ5Q", "B095VYB71Y", "B095WZ4HV2", "B095WZ8H98", + "B0963775J3", "B09656X96F", "B096783WXN", "B09679MGFV", "B0967GTGGK", + "B0967XY96K", "B0967Z4425", "B09687YJB7", "B09696STL5", "B096B1J7LH", + "B096CPR7JL", "B096F758K3", "B096FDJ3VG", "B096FFCC92", "B096FGV4Q8", + "B096FMJFZ8", "B096FXY4LX", "B096HR2S41", "B096KND1P2", "B096LZG6DX", + "B096QFVPTG", "B096QK6WBZ", "B096TPYMQ8", "B096VY3HHV", "B096Z7ZSQW", + "B096Z8T537", "B09729TDWR", "B0972PQKGH", "B097339C54", "B097357HWV", + "B0975RXRV1", "B0976TD987", "B0978TF24P", "B097B984NG", "B097BFFLF1", + "B097BG6P5Q", "B097DF29V3", "B097DLV23M", "B097DM8D9B", "B097DZK928", + "B097GHB6VR", "B097GZ2R31", "B097H41F7X", "B097HJ5XLQ", "B097JDGTHV", + "B097N1KCGH", "B097NJGPR2", "B097QV48FX", "B097R5R7CC", "B097RD43N7", + "B097RHM1PW", "B097SPTQH6", "B097SRVKKX", "B097TDRN7Y", "B097TH2BMB", + "B097YBDXJ7", "B097ZL4GS5", "B097ZL6F2G", "B0982QN762", "B0982SV1LN", + "B0983W33P5", "B0985P3DXF", "B0986F6Y1R", "B0986LJ35X", "B0987P3S4T", + "B0987TQ7V1", "B0988THPWT", "B0989GLPJF", "B0989Q594H", "B098B336HF", + "B098B62F42", "B098C2R1TJ", "B098DW5DCM", "B098JJZCBP", "B098JYFFYH", + "B098K86DR9", "B098KMPTWH", "B098MGK1PV", "B098MJ59GL", "B098P4J9X1", + "B098SMDGQQ", "B098W664WT", "B098X9VFLW", "B0992JNZW2", "B0995W5MPL", + 
"B0999C8LDB", "B099DDYFH5", "B099DFXQLW", "B099DR1GWX", "B099HSLHPG", + "B099JCH1X4", "B099MFTNYJ", "B099MRK5QG", "B099NTND1N", "B099Q49CZM", + "B099RFN7K6", "B099RMVMMC", "B099RRX3YL", "B099S7Z3WY", "B099SFDW8W", + "B099TZT2XR", "B099W2BV72", "B099WPCHH9", "B099Z777JG", "B09B14JPDC", + "B09B1BDFJY", "B09B38TXW8", "B09B6TW6KJ", "B09B7358HK", "B09B7JVJ6L", + "B09B7TD3T4", "B09B7ZLC6V", "B09B8258TS", "B09BCNMBFJ", "B09BDCHCY9", + "B09BDF115M", "B09BDGGT8J", "B09BF663Q4", "B09BFNDK3N", "B09BKPZB72", + "B09BL595RW", "B09BLQD1RG", "B09BMQ4JM5", "B09BMSW4JX", "B09BNFVMQC", + "B09BPYBD11", "B09BQ9DX7M", "B09BQD2NWG", "B09BR4BFVL", "B09BTTX77H", + "B09BVJCNSF", "B09BVM5TJZ", "B09BVT3GXL", "B09BZ2LJJH", "B09C16KJ99", + "B09C1DTMZB", "B09C1X9T2Y", "B09C3GNBHL", "B09C3NXR2Z", "B09C5CJNF3", + "B09C5Q13VH", "B09C6122Q1", "B09C61WXR1", "B09C7TYWP1", "B09C7X2HG2", + "B09C81TYGG", "B09C8GNSDS", "B09C8YD8NJ", "B09CDC9PQM", "B09CFTYGTD", + "B09CGNL32M", "B09CGVVLZZ", "B09CH5RKK7", "B09CJNS6MX", "B09CNMH58Z", + "B09CP7ZZFM", "B09CPZP5TW", "B09CQ293P1", "B09CQ3JJ18", "B09CTCJBPP", + "B09CYDL7Y2", "B09CYLDD5Z", "B09CYRJYHX", "B09CYXVFGV", "B09CZH3W21", + "B09D135T6F", "B09D2N8ZPC", "B09D31RLP6", "B09D3CF86R", "B09D3RVQB5", + "B09D4WPBD5", "B09D7LD5KW", "B09D9FV7CP", "B09DBJ82ZH", "B09DC7QJLG", + "B09DF5TY21", "B09DFCJ258", "B09DFPKJ2J", "B09DG799FB", "B09DGHXZS6", + "B09DKQ62JF", "B09DKYT56G", "B09DNZVKZT", "B09DPCKNBK", "B09DPT467Z", + "B09DRX4YT4", "B09DSTH5W9", "B09DV5D228", "B09DVK7Q8W", "B09DVTJPKP", + "B09DX1J7VD", "B09DYL8GJN", "B09DYPCDVW", "B09DYSVGBP", "B09DYVBZY8", + "B09F2Y6P3F", "B09F37SCVD", "B09F64B3KS", "B09F66XLWY", "B09F6BNVT6", + "B09F6JPT3G", "B09F6PDNDZ", "B09F9C1Y7F", "B09F9CFZJJ", "B09F9LFGDJ", + "B09F9Z2DFB", "B09FFP37XY", "B09FHHYMVM", "B09FJRG9G7", "B09FJYV6ZK", + "B09FNVYV8D", "B09FNYB5GL", "B09FPND68L", "B09FSJZCY8", "B09FSTTJKS", + "B09FXVS48N", "B09FY387CL", "B09FY51L6B", "B09FYXNNBF", "B09FZ6GQ15", + "B09FZFH7QV", "B09FZTYT8X", "B09G2D8WCM", "B09G2MX3Z4", 
"B09G2RF3PG", + "B09G6QSLZJ", "B09G6S72QV", "B09G71YGSV", "B09G75PX77", "B09G9LB74G", + "B09G9N6GKP", "B09GF87ZNG", "B09GG22167", "B09GK1DX1C", "B09GK5FN5C", + "B09GK91RLQ", "B09GK9SGBM", "B09GL53HWC", "B09GLJT3WZ", "B09GP72MFV", + "B09GPWST54", "B09GT58PZL", "B09GTM8CZP", "B09GVNWBF8", "B09GW74VT2", + "B09GXG8HXK", "B09GXPZLQH", "B09GXR7S6Q", "B09GXTNQW1", "B09GXWGV6H", + "B09GY2R4K9", "B09H2DL8SK", "B09H2GC8JJ", "B09H2QHLGY", "B09H3DZ1HP", + "B09H3KHGMT", "B09H3LWR2B", "B09H3P43Z5", "B09H3YDVJ9", "B09H5NK549", + "B09H5S27RQ", "B09H6F4MD7", "B09H6KFRV9", "B09H6VRJYY", "B09H72RWXJ", + "B09HBBJ95K", "B09HBGBDDL", "B09HBKP3DP", "B09HBMKTMV", "B09HBSTWK9", + "B09HBY52DP", "B09HC6FJHH", "B09HC7Q2V7", "B09HC7W6BH", "B09HGM5B1C", + "B09HGXQJ8G", "B09HGYVSJM", "B09HHLPN9T", "B09HJM8Z5T", "B09HJR6FP2", + "B09HJTC3YR", "B09HKJFT7Y", "B09HKKW6W8", "B09HKM463K", "B09HKPBCX3", + "B09HN1CY8H", "B09HN4NBDF", "B09HP3HJJ3", "B09HP3NLY7", "B09HPJSHQ1", + "B09HR5651S", "B09HR8253V", "B09HRJ7G74", "B09HRLD8LK", "B09HSNJBVH", + "B09HSSCQ7N", "B09HT7G7VJ", "B09HT872JP", "B09HT8LTDZ", "B09HT95S1S", + "B09HT99C2M", "B09HTZYS5X", "B09HWR2FHP", "B09HWVLWRW", "B09HX1YV12", + "B09HX5VVWB", "B09HX6S6HN", "B09HX7MVG1", "B09HX7Q84R", "B09HX945GX", + "B09HXC1RTV", "B09HXHQVSG", "B09HXJYXQG", "B09HY4FN3Q", "B09HYTCT9G", + "B09HYV5JSP", "B09HZ1ST61", "B09HZ8FPG2", "B09HZMPNC1", "B09J1N9BMW", + "B09J1ZVSYR", "B09J1ZWZX1", "B09J217SNW", "B09J218FDX", "B09J21FD2K", + "B09J21H9N9", "B09J24QDDP", "B09J2F3J7K", "B09J2LWNZX", "B09J4PY9C9", + "B09J4VCNY5", "B09J54N8JD", "B09J7KJ6W2", "B09J7RCXMC", "B09J85G48F", + "B09J8CXXF1", "B09J8DBLBY", "B09J8FXQKG", "B09J8H4QTK", "B09J8LTH94", + "B09J8SZMH4", "B09J96SQGH", "B09JB21P2R", "B09JB469QT", "B09JBDHXQN", + "B09JBL2GK5", "B09JBY9QSH", "B09JFQVBJ9", "B09JG5XS8Z", "B09JGFZSRR", + "B09JJQCLF3", "B09JJVG3SQ", "B09JK6BQ51", "B09JK8FW4H", "B09JL12CPV", + "B09JLDN7FB", "B09JLK9Y85", "B09JM1HFLS", "B09JMW9ZLT", "B09JPGZKYR", + "B09JS71VK6", "B09JSS6JYD", 
"B09JW1M5FS", "B09JW8ZBHH", "B09JWBG2VM", + "B09JWCDJMN", "B09JWJ4997", "B09JWJNTQ1", "B09JYNSJBX", "B09JYQMLQZ", + "B09JZBHRLP", "B09JZD4Q46", "B09JZGNY5H", "B09K44F4S8", "B09K4FFJ51", + "B09K6FKQFN", "B09K761P9F", "B09K7K5HPR", "B09KBN9PWN", "B09KG8VTW3", + "B09KGNFP6P", "B09KGZ68TR", "B09KH1K7LM", "B09KH1S88H", "B09KH8YJKL", + "B09KHCFG6Q", "B09KHKMFVL", "B09KKTJ83P", "B09KLMP553", "B09KLQ8XG2", + "B09KLS5P9Z", "B09KLSYQRL", "B09KLW9P47", "B09KMJ6XVY", "B09KMVHB9Q", + "B09KNBGN29", "B09KNC6126", "B09KNDLKN7", "B09KNGVQJN", "B09KNQDG5D", + "B09KPFLFMR", "B09KPYXNQC", "B09KRDF2PY", "B09KRJX2BB", "B09KRNZD48", + "B09KRZF4NQ", "B09KT5J4HK", "B09KT8YJ46", "B09KTRYQZP", "B09KTVDWG5", + "B09KV2Y8X4", "B09KV4T2FM", "B09KXRCBQL", "B09KYZGH9M", "B09KZN7GB1", + "B09L12YHK8", "B09L1CP58B", "B09L4SX5CQ", "B09L7JHPMY", "B09L7KCC3Y", + "B09L7KL3JR", "B09L7KTZXK", "B09L7KW8QM", "B09L7LQZCY", "B09L7QXNCR", + "B09L837ZRJ", "B09L83TVZF", "B09L85ZBZR", "B09LC16X7H", "B09LC4J9BC", + "B09LC9CLQD", "B09LCBPRXY", "B09LCCJ6NT", "B09LCJDCHK", "B09LCKP897", + "B09LCM2WG4", "B09LCNV4JW", "B09LCPNNDM", "B09LCQ21BC", "B09LCWSSMY", + "B09LGZG9LN", "B09LLFDV5Q", "B09LLPXVZY", "B09LLV7PSS", "B09LM1ZTM8", + "B09LM6GKMM", "B09LM6TD73", "B09LMJMFDG", "B09LQ9MKYQ", "B09LQC12Q6", + "B09LQG5WPX", "B09LQGY84X", "B09LQHNJJL", "B09LR24RXV", "B09LR26H5Z", + "B09LRRBCBZ", "B09LRRFX5N", "B09LRRQ7PY", "B09LRRRTYF", "B09LRRW25Y", + "B09LSWJF7L", "B09LTSJ9TC", "B09LTSS9QQ", "B09LV9G8VL", "B09LYH5SYK", + "B09LYTRXW1", "B09LYVF3C8", "B09LYW87WH", "B09M3PZLVZ", "B09M6V66CC", + "B09M73P45B", "B09M7C1YWB", "B09M7QFNTC", "B09M8963F5", "B09M89P2KH", + "B09M8G3QZ9", "B09M8HQ8GD", "B09M9HBRZJ", "B09M9WTVQX", "B09M9XJDSP", + "B09MD26QFB", "B09MF8YB2K", "B09MFKTG3C", "B09MFLB7VR", "B09MFMK4VX", + "B09MH9JTKB", "B09MHS5NY9", "B09MJ18V7P", "B09MJQVMBQ", "B09MJWY5SY", + "B09MJYSGZH", "B09MK7FNWS", "B09MKC5BRF", "B09MKCYDPB", "B09MKDGXXG", + "B09MKMMJ55", "B09MKQV3XB", "B09MKQZG5N", "B09MKVV4DV", "B09MLB4CNP", + 
"B09MLD9MRJ", "B09MLDGRQ1", "B09MLV4XF2", "B09MQ7PGZ3", "B09MQNLF8X", + "B09MQRPL77", "B09MQS99QJ", "B09MQVWH8Q", "B09MQY9TVN", "B09MR7PHN1", + "B09MRLK1M8", "B09MSXRTBS", "B09MT96W4Y", "B09MTCVQ8Y", "B09MTM3F72", + "B09MTMX7Q3", "B09MTPCN7L", "B09MTPTX1Q", "B09MVML5PX", "B09MVS9QWN", + "B09MYHB7BB", "B09MYQK6QH", "B09MYQZ3PW", "B09MZ5XNX3", "B09MZ8KYJR", + "B09MZ9BKJP", "B09MZ9TT3R", "B09N1D9LRC", "B09N1DF64V", "B09N1F6H2P", + "B09N1NCG5W", "B09N3CH82P", "B09N3DGR48", "B09N3KPPRF", "B09N3MR529", + "B09N3NWMZD", "B09N5HBSDD", "B09N6XPL23", "B09N6ZTQX9", "B09N749QL4", + "B09N79697S", "B09N8QWGP7", "B09N8TTWND", "B09N94LQMG", "B09NBB6Y2R", + "B09NBBMRJH", "B09NBGWSKM", "B09NBGY5Z3", "B09NBJ5X1W", "B09NBVK9JG", + "B09NCWK16N", "B09NCXSXLH", "B09NCXT4VD", "B09ND17LSX", "B09ND1M3WH", + "B09NDH2Z4V", "B09NDHNT4B", "B09NDHZF36", "B09NDJVG6J", "B09NFDZ6NS", + "B09NFSSG7Y", "B09NH1ZGFB", "B09NJS9RS2", "B09NKDD3HY", "B09NKFNG45", + "B09NLPMXP8", "B09NM62G51", "B09NM83C17", "B09NMD225N", "B09NMFXBKD", + "B09NMZWDP1", "B09NN1S77Q", "B09NN3Q2TM", "B09NN3WN2W", "B09NN4K7N3", + "B09NN5CMVH", "B09NNBPHGQ", "B09NNLF61S", "B09NPGDPZC", "B09NPHC8P6", + "B09NPLW3CM", "B09NPMT85W", "B09NQ2HW66", "B09NQ3WVZJ", "B09NQ5KHQ6", + "B09NQZ94VD", "B09NQZJWTS", "B09NQZPJVV", "B09NSDDNLV", "B09NSRCT17", + "B09NSXX182", "B09NVFX6N7", "B09NVG4NHF", "B09NVPPFDL", "B09NVT3Q7H", + "B09NVX6VRW", "B09NW45CX4", "B09NXRR9GR", "B09NXS6P5X", "B09NXT3HPT", + "B09NXT7YZM", "B09NXTR2YY", "B09NXTSJYT", "B09NXTXNCR", "B09NY3T7R8", + "B09NY4NJJ5", "B09NY5RBHC", "B09NYFJ1TF", "B09NZDG5HT", "B09NZX8THX", + "B09P15DFSZ", "B09P17L22Y", "B09P1FMM5R", "B09P1H7L7Z", "B09P1JQS7V", + "B09P1S8SRH", "B09P32YSBX", "B09P4RZ8RN", "B09P4ZGSR2", "B09P512BDK", + "B09P5HRJP6", "B09P5VCDL8", "B09P5VVPV7", "B09P61D6VD", "B09P6DP7WT", + "B09P7TH2ZW", "B09P7TXXG9", "B09P844C92", "B09P85WQ5X", "B09P87NZTN", + "B09P8B149Z", "B09P9YZWD6", "B09PB5GR3L", "B09PB5TV6G", "B09PBWXC2K", + "B09PDPG31D", "B09PFX9F5T", "B09PG9KQ24", "B09PJ74NFR", 
"B09PL77NP9", + "B09PL7D2YZ", "B09PMJVCZF", "B09PN3GYPD", "B09PND2Z9N", "B09PNDMJHZ", + "B09PNMNVN7", "B09PQ9YNRJ", "B09PQFT8KV", "B09PQM615Y", "B09PQQ3TFN", + "B09PQQ9D41", "B09PQVGZYT", "B09PR7L89L", "B09PRKB6JY", "B09PTJSWG7", + "B09PTKTSB8", "B09PTPYZM7", "B09PTRBVJ9", "B09PTYB6BV", "B09PV6CJJB", + "B09PY71FMR", "B09PY85J4X", "B09PYFXBZ5", "B09PYN1G1P", "B09PYQSKC7", + "B09PYVX7P6", "B09PZ7BGTN", "B09PZB9GJV", "B09Q11K4TR", "B09Q1Q3FWD", + "B09Q2R4L5H", "B09Q2VK1L1", "B09Q3GYP3V", "B09Q3JFQV3", "B09Q5M2DSR", + "B09Q6D967X", "B09QBPF3TR", "B09QBSY8Z6", "B09QBXLC1M", "B09QBXW33C", + "B09QCNXTHL", "B09QFXK7XG", "B09QHP2YYP", "B09QHPKXCV", "B09QHQ3TF4", + "B09QHQX9L9", "B09QHTJLNL", "B09QJTBFJ2", "B09QK17DSV", "B09QKNCZGP", + "B09QKQSS4P", "B09QKRP818", "B09QKS4XB2", "B09QKW4Y1J", "B09QKZ41KB", + "B09QLV2WSS", "B09QM9TNS8", "B09QMFD8VX", "B09QPQ79J6", "B09QQD1M6W", + "B09QQG26L8", "B09QSL2ZZ6", "B09QXF864N", "B09QYKRVR8", "B09QYPBHK5", + "B09R25MDB6", "B09R5184ZB", "B09R7GKTQ2", "B09R8YPSD9", "B09R94254P", + "B09R9WLR4L", "B09RHK53JB", "B09RJ7V7BK", "B09RTDDSKD", "B09RTGR9GS", + "B09RVVNCB9", "B09S5QLQHF", "B09S5XRTB4", "B09S611N6J", "B09S9L9SJZ", + "B09S9YFWQN", "B09SB2Q7JM", "B09SCQFLBS", "B09SD7SCHF", "B09SDB7M9D", + "B09SDBRS8T", "B09SDV6WHM", "B09SDVQYLX", "B09SDW5RQP", "B09SGGRYP6", + "B09SHCXCQJ", "B09SLR2VL5", "B09SP9WGBB", "B09SPH833Z", "B09SPTMT7Y", + "B09SSS6BVC", "B09STDR2Q7", "B09SW42QBL", "B09SWNMWXH", "B09SXPDQ4P", + "B09SY7N3X7", "B09SZCXM2L", "B09T6V1M4Q" + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 514, + "intent_template_id": 189, + "start_urls": ["__SHOPPING__"], "intent": "Add a white desk to my wish list.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": 
"document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "white", - "desk" - ] - } - } - ] - }, - "intent_template_id": 189 - }, - { - "sites": [ - "shopping" - ], - "task_id": 515, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": { - "product": "white computer desk" - }, + "instantiation_dict": {"product": "white desk"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "skus": [ + "B003B3NR62", "B004773CKW", "B00846JN00", "B00881ECVG", "B00EUT6LX2", + "B00FHXI13I", "B00FOOG5RY", "B00GMPDA9K", "B00NQHH5XO", "B0192REQ8I", + "B01MQFOWCJ", "B01N8SKYHP", "B06XPTGYL7", "B071KF55KQ", "B074H7JDCS", + "B076B6P1W8", "B0774FK6B7", "B077TK98RF", "B079313Z19", "B07CMJWM3Y", + "B07CPQPXGS", "B07DG6PY4P", "B07FR7KNRX", "B07H56P8LM", "B07HK436BZ", + "B07JFRM23P", "B07KGLPCRQ", "B07KT4NRFK", "B07MCCQ2HV", "B07MV1FZFB", + "B07NBXGXZN", "B07PB6179B", "B07PJG6YGY", "B07PMRWR8W", "B07R7KVGQG", + "B07RKLTDQS", "B07S6D3NPM", "B07T1FS6X2", "B07V8SDLBQ", "B07W1GGR6X", + "B07W5DZLZK", "B07XKS5X6G", "B07Y38GFGT", "B07YKSG7XF", "B07YWTSPQ3", + "B07ZJH1Z3X", "B07ZX3PBJ1", "B0812Z74DR", "B0813XDJFZ", "B08243WG1C", + "B0829XTKK1", "B082Q1LC55", "B082XXDVC4", "B082YVMJYS", "B083NX39PD", + "B083WM9PLP", "B084JMJ7B2", "B0868HY623", "B086WN6DC2", "B086Z8JGW6", + "B0876FCLTY", "B08DFJZCV8", "B08GPH26G3", "B08GWWZ3QP", "B08GYG3M6F", + "B08HCS3W8J", "B08HKZSN2M", "B08HN9K15V", "B08HVNQ5M2", "B08J1D4LLB", + "B08J4334DD", "B08JHCP28Q", "B08KTB1M8Q", "B08L4K5R93", "B08M5GWW3W", + "B08MKVKT5Q", "B08MTNPRX9", 
"B08N41HK9V", "B08ND9WC77", "B08NGDMJXN", + "B08NTWRS9X", "B08PB1TJ59", "B08PZ6QYLG", "B08QCGV9NS", "B08QF7FMG6", + "B08QF8XD1W", "B08QJFMBYZ", "B08RDGM6FD", "B08RHWN4L5", "B08S3TWCJ6", + "B08SDZS9QW", "B08SW9NN1P", "B08T64F9DB", "B08TRPQ4HQ", "B08TTGPZBM", + "B08VRFGD3K", "B08XGJHZQC", "B08Y1YSZT4", "B08YJLKR5R", "B08ZDRDMZY", + "B08ZNCSSF1", "B08ZXWR255", "B0914V6DY8", "B0927FPD86", "B092D256Q1", + "B092M5LW83", "B092M6GPJY", "B093225WFY", "B093K9SGX6", "B093KTVYF6", + "B093T9JT18", "B094CWP4M3", "B094NDDCSG", "B094QKZMY8", "B094QL8C87", + "B094VG34CW", "B094XPL4V8", "B096JZQLS5", "B096VQVV17", "B0972GMJ5N", + "B097N8ZCRR", "B0987G7T84", "B098L2JVY7", "B098LLNQGH", "B098Q9N5ZV", + "B098RVVNN9", "B098XJRTBH", "B099KRGV13", "B099PCGKXZ", "B099RPWD9M", + "B099WF1B5R", "B099WVV4BG", "B099ZCD7FS", "B09BVP53ZX", "B09BW1L6CH", + "B09C225J31", "B09CZ4776Q", "B09D3N1FJK", "B09DG76B6M", "B09DSHCMQX", + "B09DSRQGHP", "B09F37XFG5", "B09F628ZJQ", "B09F6KLB84", "B09FLP98S2", + "B09FTJQKVM", "B09FXGF9J5", "B09GK3H2CW", "B09HGVCTW5", "B09HN7XYR9", + "B09HN87JYR", "B09HTZFPLY", "B09J46P6ZG", "B09J53SGDV", "B09J8FDSBH", + "B09KN9KPWV", "B09KSYWTK1", "B09KTZ3K7P", "B09L5QVNPP", "B09L5ZPQBL", + "B09LC9DPFZ", "B09LHZK55X", "B09LYNXBZ9", "B09MCLF3FC", "B09MFX8S3W", + "B09MH9JTKB", "B09MQFZT7G", "B09MW76LJP", "B09MW7LTTK", "B09N8M8F3M", + "B09NBRVMXM", "B09NKNVMQX", "B09NQWPHW2", "B09NY4NJJ5", "B09P1BY2ZK", + "B09P3MT8BW", "B09P556Z72", "B09P8QDR8R", "B09PBJX6QJ", "B09PBTKW3V", + "B09PDRWG7K", "B09PDSLHG9", "B09PDTY47P", "B09PFRHMCK", "B09PFTFZT5", + "B09PFVQCWJ", "B09PFW3WBJ", "B09PSMZ213", "B09PTL9FYT", "B09PYWDGKZ", + "B09S8RSPTY", "B09SY4YD23" + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 515, + "intent_template_id": 189, + "start_urls": ["__SHOPPING__"], "intent": "Add a white computer desk to my wish list.", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - 
"reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "white", - "computer", - "desk" - ] - } - } - ] - }, - "intent_template_id": 189 - }, - { - "sites": [ - "shopping" - ], + "intent_template": "Add a {{product}} to my wish list.", + "instantiation_dict": {"product": "white computer desk"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "skus": [ + "B003B3NR62", "B00EUT6LX2", "B00FHXI13I", "B00GMPDA9K", "B01MQFOWCJ", + "B01N8SKYHP", "B071KF55KQ", "B0774FK6B7", "B07CPQPXGS", "B07FR7KNRX", + "B07PB6179B", "B07PMRWR8W", "B07R7KVGQG", "B07RKLTDQS", "B07W1GGR6X", + "B07W5DZLZK", "B07XKS5X6G", "B07Y38GFGT", "B07YWTSPQ3", "B0813XDJFZ", + "B08243WG1C", "B0829XTKK1", "B082XXDVC4", "B083NX39PD", "B0868HY623", + "B08GPH26G3", "B08GYG3M6F", "B08HCS3W8J", "B08HN9K15V", "B08HVNQ5M2", + "B08J1D4LLB", "B08J4334DD", "B08L4K5R93", "B08N41HK9V", "B08QF8XD1W", + "B08QJFMBYZ", "B08RHWN4L5", "B08S3TWCJ6", "B08T64F9DB", "B08TRPQ4HQ", + "B08TTGPZBM", "B08VRFGD3K", "B08ZDRDMZY", "B08ZXWR255", "B0927FPD86", + "B093225WFY", "B093K9SGX6", "B093KTVYF6", "B093T9JT18", "B094QKZMY8", + "B097N8ZCRR", "B098LLNQGH", "B098Q9N5ZV", "B098RVVNN9", "B099KRGV13", + "B099PCGKXZ", "B099WF1B5R", "B099ZCD7FS", "B09BW1L6CH", "B09DG76B6M", + "B09DSHCMQX", "B09F628ZJQ", "B09FTJQKVM", "B09HN7XYR9", "B09HN87JYR", + "B09HTZFPLY", "B09J46P6ZG", "B09KN9KPWV", "B09KSYWTK1", "B09L5ZPQBL", + "B09LHZK55X", "B09LYNXBZ9", "B09MH9JTKB", "B09MW76LJP", "B09N8M8F3M", + "B09NY4NJJ5", "B09P1BY2ZK", "B09P8QDR8R", "B09PBTKW3V", "B09PDRWG7K", + "B09PDSLHG9", 
"B09PDTY47P", "B09PFTFZT5", "B09PFVQCWJ", "B09PFW3WBJ", + "B09PSMZ213", "B094R8LKJZ" + ] + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], "task_id": 516, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html", - "geolocation": null, + "intent_template_id": 196, + "start_urls": [ + "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html" + ], + "intent": "Add this product to my wishlist", "intent_template": "Add this product to my wishlist", "instantiation_dict": {}, - "intent": "Add this product to my wishlist", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B0040WHKIY" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch" - ] - } - } - ] - }, - "intent_template_id": 196, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B0040WHKIY"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": 
["shopping"], "task_id": 517, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html", - "geolocation": null, + "intent_template_id": 196, + "start_urls": [ + "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html" + ], + "intent": "Add this product to my wishlist", "intent_template": "Add this product to my wishlist", "instantiation_dict": {}, - "intent": "Add this product to my wishlist", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B01MTYJG38" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Skinit Decal Gaming Skin Compatible with Xbox One S Console and Controller Bundle - Officially Licensed NFL Baltimore Ravens Design" - ] - } - } - ] - }, - "intent_template_id": 196, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01MTYJG38"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + 
"sites": ["shopping"], "task_id": 518, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html", - "geolocation": null, + "intent_template_id": 196, + "start_urls": ["__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html"], + "intent": "Add this product to my wishlist", "intent_template": "Add this product to my wishlist", "instantiation_dict": {}, - "intent": "Add this product to my wishlist", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B01MY87FWG" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Sceptre E195BD-SRR 19-Inch 720P LED TV, True Black (2017)" - ] - } - } - ] - }, - "intent_template_id": 196, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B01MY87FWG"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 519, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": 
"__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html", - "geolocation": null, + "intent_template_id": 196, + "start_urls": [ + "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html" + ], + "intent": "Add this product to my wishlist", "intent_template": "Add this product to my wishlist", "instantiation_dict": {}, - "intent": "Add this product to my wishlist", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B09GG4P4MD" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "iPhone 13 Pro Max Case, Neon Turtle iPhone 13 Pro Max Cases, Tempered Glass Back+Soft Silicone TPU Shock Protective Case for Apple iPhone 13 Pro Max" - ] - } - } - ] - }, - "intent_template_id": 196, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B09GG4P4MD"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 520, - "require_login": true, - 
"storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html", - "geolocation": null, + "intent_template_id": 196, + "start_urls": [ + "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html" + ], + "intent": "Add this product to my wishlist", "intent_template": "Add this product to my wishlist", "instantiation_dict": {}, - "intent": "Add this product to my wishlist", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_in_wishlist", - "expected_data": { - "sku": "B073XDR3K6" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/wishlist/", - "locator": "document.querySelector('.products-grid.wishlist').outerText", - "required_contents": { - "must_include": [ - "Magnetic Metal Stainless Steel D-pads Kits Directional Pad Replacement Parts for Xbox One Elite Controller, Elite Series 2, Xbox One, Xbox One S/X Controller" - ] - } - } - ] - }, - "intent_template_id": 196, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B073XDR3K6"} } - ] + ], + "revision": 2 }, { - "sites": [ - 
"shopping" - ], + "sites": ["shopping"], "task_id": 521, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 199, + "start_urls": ["__SHOPPING__"], + "intent": "Subscribe to the newsletter of OneStopMarket", "intent_template": "Subscribe to the newsletter of OneStopMarket", "instantiation_dict": {}, - "intent": "Subscribe to the newsletter of OneStopMarket", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_newsletter", - "expected_data": null - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/newsletter/manage/", - "locator": "document.querySelector('[title=\"General Subscription\"').checked.toString()", - "required_contents": { - "exact_match": "true" - } - } - ] - }, - "intent_template_id": 199, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + {"evaluator": "BackendStateEvaluator", "site": "shopping", "expected": true} + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 522, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Fork {{repo}}.", - "instantiation_dict": { - "repo": "all repos from facebook" - }, - "intent": "Fork all repos from facebook.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": 
"user_has_project", - "eval_params": { - "project": [ - "create-react-app", - "buck" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/projects", - "locator": "document.querySelector('[data-qa-selector=\"projects_list\"').outerText", - "required_contents": { - "must_include": [ - "create-react-app", - "buck" - ] - } - } - ] - }, "intent_template_id": 352, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Fork all repos from facebook.", + "intent_template": "Fork {{repo}}.", + "instantiation_dict": {"repo": "all repos from facebook"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "project": ["create-react-app", "buck"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 523, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": { - "number": "five" - }, - "intent": "Star the top five most stared repos in Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_starred_project", - "eval_params": { - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", - 
"create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter", - "AndroidAsync" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/starred", - "locator": "", - "required_contents": { - "must_include": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter", - "AndroidAsync" - ] - } - } - ] - }, "intent_template_id": 354, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Star the top five most stared repos in Gitlab", + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": {"number": "five"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter", + "AndroidAsync" + ], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 524, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": { - "number": "eight" - }, - "intent": "Star the top eight most stared repos in Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - 
"name": "user_has_starred_project", - "eval_params": { - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter", - "AndroidAsync", - "Pytorch-GAN", - "administrate", - "keycloak" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/starred", - "locator": "", - "required_contents": { - "must_include": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter", - "AndroidAsync", - "Pytorch-GAN", - "administrate", - "keycloak" - ] - } - } - ] - }, "intent_template_id": 354, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Star the top eight most stared repos in Gitlab", + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": {"number": "eight"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "project_names": [ + "AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter", + "AndroidAsync", "Pytorch-GAN", "administrate", "keycloak" + ], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 525, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - 
"intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": { - "number": "four" - }, - "intent": "Star the top four most stared repos in Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_starred_project", - "eval_params": { - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/starred", - "locator": "", - "required_contents": { - "must_include": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python", - "PHP_XLSXWriter" - ] - } - } - ] - }, "intent_template_id": 354, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Star the top four most stared repos in Gitlab", + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": {"number": "four"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "project_names": ["AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 526, - "require_login": true, - "storage_state": 
"./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": { - "number": "three" - }, - "intent": "Star the top three most stared repos in Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_starred_project", - "eval_params": { - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/starred", - "locator": "", - "required_contents": { - "must_include": [ - "AndroidSlidingUpPanel", - "create-react-app", - "ffmpeg-python" - ] - } - } - ] - }, "intent_template_id": 354, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Star the top three most stared repos in Gitlab", + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": {"number": "three"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "project_names": ["AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 527, - "require_login": true, - 
"storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": { - "number": "one" - }, - "intent": "Star the top one most stared repos in Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "user_has_starred_project", - "eval_params": { - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/starred", - "locator": "", - "required_contents": { - "must_include": [ - "AndroidSlidingUpPanel" - ] - } - } - ] - }, "intent_template_id": 354, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Star the top one most stared repos in Gitlab", + "intent_template": "Star the top {{number}} most stared repos in Gitlab", + "instantiation_dict": {"number": "one"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "user_id": "byteblaze", + "project_names": ["AndroidSlidingUpPanel"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 528, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - 
"intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": { - "product": "phone screen protector", - "time": "March 2023" - }, - "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000180", - "12.99" - ] - } - } - ] - }, "intent_template_id": 154, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "instantiation_dict": {"product": "phone screen protector", "time": "March 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 529, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": { - "product": "bluetooth speaker", - "time": "Feb 2023" - }, - "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. State: \"It broke after three days of use\". 
The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000148", - "169.95" - ] - } - } - ] - }, "intent_template_id": 154, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": {"product": "bluetooth speaker", "time": "Feb 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 530, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": { - "product": "kitchen organizer", - "time": "around Feb 2023" - }, - "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. It broke after three days of use. 
The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000161", - "68.88" - ] - } - } - ] - }, "intent_template_id": 154, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": {"product": "kitchen organizer", "time": "around Feb 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 531, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": { - "product": "phone case", - "time": "March 2023" - }, - "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. It broke after three days of use. 
The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000180", - "$12.99" - ] - } - } - ] - }, "intent_template_id": 154, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": {"product": "phone case", "time": "March 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 532, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": { - "product": "PS3 remote controller", - "time": "early 2023" - }, - "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "original.intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. It broke after three days of use. 
The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "require_reset": false, - "eval": { - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000180", - "1.63" - ] - } - } - ] - }, "intent_template_id": 154, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", + "instantiation_dict": {"product": "PS3 remote controller", "time": "early 2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 533, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { - "account_list": [ - "convexegg", - "yjlou" - ] - }, - "intent": "Follow ['convexegg', 'yjlou'] on Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_following", - "eval_params": { - "username": "byteblaze", - "following_usernames": [ - "convexegg", - "yjlou" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/following", - "locator": "document.querySelector('.user-profile').outerText", - "required_contents": { - "must_include": [ - "@convexegg", - "@yjlou" - ] - } - } - ] - }, "intent_template_id": 330, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Follow ['convexegg', 'yjlou'] on Gitlab", + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { "account_list": ["convexegg", "yjlou"] }, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API-based follower check; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "following_usernames": ["convexegg", "yjlou"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 534, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { - "account_list": [ - "Jakub Klinkovsk\u00fd", - "Koushik", - "Vinta Chen" - ] - }, - "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_following", - "eval_params": { - "username": "byteblaze", - "following_usernames": [ - "lahwaacz", - "koush", - "vinta" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/following", - "locator": "document.querySelector('.user-profile').outerText", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@koush", - "@vinta" - ] - } - } - ] - }, "intent_template_id": 330, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab", + "intent_template": "Follow {{account_list}} on Gitlab", 
+ "instantiation_dict": { "account_list": ["Jakub Klinkovsk\u00fd", "Koushik", "Vinta Chen"] }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API-based follower check; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "following_usernames": ["lahwaacz", "koush", "vinta"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 535, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { - "account_list": [ - "Jakub K", - "ghost", - "Beno\u00eet Blanchon" - ] - }, - "intent": "Follow ['Jakub K', 'ghost', 'Beno\u00eet Blanchon'] on Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_following", - "eval_params": { - "username": "byteblaze", - "following_usernames": [ - "lahwaacz", - "ghost", - "bblanchon" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/following", - "locator": "document.querySelector('.user-profile').outerText", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@ghost", - "@bblanchon" - ] - } - } - ] - }, "intent_template_id": 330, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Follow ['Jakub K', 
'ghost', 'Beno\u00eet Blanchon'] on Gitlab", + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { "account_list": ["Jakub K", "ghost", "Beno\u00eet Blanchon"] }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API-based follower check; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "following_usernames": ["lahwaacz", "ghost", "bblanchon"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 536, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { - "account_list": [ - "ghost", - "R1kk3r", - "Abishek" - ] - }, - "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_following", - "eval_params": { - "username": "byteblaze", - "following_usernames": [ - "ghost", - "R1kk3r", - "abisubramanya27" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/following", - "locator": "document.querySelector('.user-profile').outerText", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@R1kk3r", - "@abisubramanya27" - ] - } - } - ] - }, "intent_template_id": 330, - 
"changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab", + "intent_template": "Follow {{account_list}} on Gitlab", + "instantiation_dict": { "account_list": ["ghost", "R1kk3r", "Abishek"] }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API-based follower check; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "following_usernames": ["ghost", "R1kk3r", "abisubramanya27"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 537, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 330, + "start_urls": ["__GITLAB__"], + "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { - "account_list": [ - "Jakub Klinkovsk", - "convexegg", - "Vinta Chen", - "yjlou", - "Abishek S" - ] - }, - "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "is_user_following", - "eval_params": { - "username": "byteblaze", - "following_usernames": [ - "lahwaacz", - "convexegg", - "vinta", - "yjlou", - "abisubramanya27" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - 
"reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/users/byteblaze/following", - "locator": "document.querySelector('.user-profile').outerText", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@convexegg", - "@vinta", - "@yjlou", - "@abisubramanya27" - ] - } - } - ] + "account_list": ["Jakub Klinkovsk", "convexegg", "Vinta Chen", "yjlou", "Abishek S"] }, - "intent_template_id": 330, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API-based follower check; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "username": "byteblaze", + "following_usernames": ["lahwaacz", "convexegg", "vinta", "yjlou", "abisubramanya27"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 538, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 240, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001", "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": { "order_id": "299", "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" }, - "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": 
"verify_order_details", - "eval_params": { - "order_id": "299" - }, - "expected_data": { - "address": "456 Oak Avenue", - "address2": "Apartment 5B", - "city": "New York", - "state": "New York", - "zip_code": "10001", - "country": "US" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299", - "locator": "", - "required_contents": { - "must_include": [ - "456 Oak Avenue", - "Apartment 5B", - "New York", - "10001" - ] - } - } - ] - }, - "intent_template_id": 240, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "299", + "site": "shopping_admin", + "expected": { + "address": "456 Oak Avenue", + "address2": "Apartment 5B", + "city": "New York", + "state": "New York", + "zip_code": "10001", + "country": "US" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 539, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": { - "order_id": "65", - "address": "789 Pine Lane, San Francisco, CA, 94102" - }, - "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - 
"eval_params": { - "order_id": "65" - }, - "expected_data": { - "address": "789 Pine Lane", - "city": "San Francisco", - "state": "California", - "zip_code": "94102" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/65", - "locator": "", - "required_contents": { - "must_include": [ - "789 Pine Lane", - "San Francisco", - "California", - "94102" - ] - } - } - ] - }, "intent_template_id": 240, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": {"order_id": "65", "address": "789 Pine Lane, San Francisco, CA, 94102"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "order_id": "65", + "site": "shopping_admin", + "expected": { + "address": "789 Pine Lane", + "city": "San Francisco", + "state": "California", + "zip_code": "94102" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 540, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 240, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201", "intent_template": "Modify the address of 
order #{{order_id}} to {{address}}", "instantiation_dict": { "order_id": "301", "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201" }, - "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "301" - }, - "expected_data": { - "address": "321 Birch Boulevard", - "address2": "Suite 200", - "city": "Dallas", - "state": "Texas", - "zip_code": "75201" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301", - "locator": "", - "required_contents": { - "must_include": [ - "321 Birch Boulevard", - "Suite 200", - "Dallas", - "Texas", - "75201" - ] - } - } - ] - }, - "intent_template_id": 240, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "301", + "site": "shopping_admin", + "expected": { + "address": "321 Birch Boulevard", + "address2": "Suite 200", + "city": "Dallas", + "state": "Texas", + "zip_code": "75201" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 541, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - 
"instantiation_dict": { - "order_id": "125", - "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101" - }, - "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "125" - }, - "expected_data": { - "address": "654 Elm Drive", - "address2": "Apartment 12", - "city": "Miami", - "state": "Florida", - "zip_code": "33101" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125", - "locator": "", - "required_contents": { - "must_include": [ - "654 Elm Drive", - "Apartment 12", - "Miami", - "Florida", - "33101" - ] - } - } - ] - }, "intent_template_id": 240, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": {"order_id": "125", "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "order_id": "125", + "site": "shopping_admin", + "expected": { + "address": "654 Elm Drive", + "address2": "Apartment 12", + "city": "Miami", + "state": "Florida", + "zip_code": "33101" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + 
"sites": ["shopping_admin"], "task_id": 542, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": { - "order_id": "300", - "address": "987 Cedar Court, Los Angeles, CA, 90012" - }, - "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_order_details", - "eval_params": { - "order_id": "300" - }, - "expected_data": { - "address": "987 Cedar Court", - "city": "Los Angeles", - "state": "California", - "zip_code": "90012" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/300", - "locator": "", - "required_contents": { - "must_include": [ - "987 Cedar Court", - "Los Angeles", - "California", - "90012" - ] - } - } - ] - }, "intent_template_id": 240, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", + "intent_template": "Modify the address of order #{{order_id}} to {{address}}", + "instantiation_dict": {"order_id": "300", "address": "987 Cedar Court, Los Angeles, CA, 90012"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "order_id": "300", + "site": "shopping_admin", + 
"expected": { + "address": "987 Cedar Court", + "city": "Los Angeles", + "state": "California", + "zip_code": "90012" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 543, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", - "original.intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments", - "instantiation_dict": { - "product": "Bella Tank" - }, - "intent": "Update the product description of Bella Tank with the titles of all reviews with 4 stars or above.", - "original.intent": "Update the product description of Bella Tank to highlight the real user positive reviews by quoting the comments", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1684" - }, - "expected_data": { - "description": [ - "A sweet n sporty look for the gym", - "Good choice for working out" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/../bella-tank.html", - "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", - "required_contents": { - "must_include": [ - "Good choice for working out and stylin' enough to wear when I'm hanging with friends on hot days", - "Also washes really well", - "Always a sweet n sporty look for the gym", - "Keeps me cool and the seams don't rub up against me like some of my other tanks" - ] - } - } - ] - }, 
"intent_template_id": 251, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update the product description of Bella Tank with the titles of all reviews with 4 stars or above.", + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "instantiation_dict": {"product": "Bella Tank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify the intent to specify exactly what to update the value with" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1684", + "site": "shopping_admin", + "expected": { + "description": ["A sweet n sporty look for the gym", "Good choice for working out"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 544, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", - "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", - "instantiation_dict": { - "product": "Selene Yoga Hoodie" - }, - "original.instantiation_dict": { - "product": "Selena Yoga Hoodie" - }, - "intent": "Update the product description of Selene Yoga Hoodie with the titles of all reviews with 4 stars or above.", - "original.intent": "Update the description of Selena Yoga Hoodie to highlight the real user 
positive reviews by quoting the comments", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1108" - }, - "expected_data": { - "description": [ - "Best hoodies I've owned.", - "Great value", - "Kept me warm" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/../selene-yoga-hoodie.html", - "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", - "required_contents": { - "must_include": [ - "I was super cold and it did the job.", - "The sleeves are definitely thicker than you realize, which is a good thing", - "really quite substantial", - "planning on buying another one of these in another color", - "the best hoodie ive ever owned" - ] - } - } - ] - }, "intent_template_id": 251, - "changelogs": [ - { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify the intent to specify exactly what to update the value with" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update the product description of Selene Yoga Hoodie with the titles of all reviews with 4 stars or above.", + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "instantiation_dict": {"product": "Selene Yoga Hoodie"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "task_ambiguity", - "note": "Product name misspelled leading to inability to update the product description. 
Changed from 'Selena' to 'Selene'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1108", + "site": "shopping_admin", + "expected": { "description": ["Best hoodies I've owned.", "Great value", "Kept me warm"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 545, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", - "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", - "instantiation_dict": { - "product": "Radiant Tee" - }, - "intent": "Update the product description of Radiant Tee with the titles of all reviews with 4 stars or above.", - "original.intent": "Update the description of Radiant Tee to highlight the real user positive reviews by quoting the comments", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1556" - }, - "expected_data": { - "description": [ - "What a versatile shirt!" 
- ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/../radiant-tee.html", - "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", - "required_contents": { - "must_include": [ - "What I rally love here is that it does the job of keeping me cool and dry", - "I'm a big guy and sweat A LOT", - "Even after a day of gulf, I'm still dry and comfortable", - "What a versatile shirt", - "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised", - "I like going out after my game for drinks so I look good then too and don't need to change into something fresh" - ] - } - } - ] - }, "intent_template_id": 251, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update the product description of Radiant Tee with the titles of all reviews with 4 stars or above.", + "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "instantiation_dict": {"product": "Radiant Tee"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify the intent to specify exactly what to update the value with" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1556", + "site": "shopping_admin", + "expected": { "description": ["What a versatile shirt!"] } } - ] + ], + 
"revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 546, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", - "original.intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments", - "instantiation_dict": { - "product": "Lucia Cross-Fit Bra" - }, - "intent": "Update the product description of Lucia Cross-Fit Bra with the titles of all reviews with 4 stars or above.", - "original.intent": "Update the description of Lucia Cross-Fit Bra to highlight the real user positive reviews by quoting the comments", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/../affirm-water-bottle.html", - "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')", - "required_contents": { - "must_include": [ - "Wide mouth opening makes it easy to clean" - ] - } - } - ] - }, "intent_template_id": 251, - "changelogs": [ - { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify the intent to specify exactly what to update the value with" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Update the product description of Lucia Cross-Fit Bra with the titles of all reviews with 4 stars or above.", + "intent_template": "Update the product description of {{product}} with the 
titles of all reviews with 4 stars or above.", + "instantiation_dict": {"product": "Lucia Cross-Fit Bra"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "reference_alignment", - "note": "Original expected value include incorrect review" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 547, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 252, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", "intent_template": "Add a new {{option}} option {{value}} to the {{base_setting}} of {{product}}", "instantiation_dict": { "option": "color", @@ -31306,62 +15493,29 @@ "base_setting": "size S", "product": "Phoebe Zipper Sweatshirt" }, - "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1130" - }, - "expected_data": { - "variants": [ - "Phoebe Zipper Sweatshirt-S-Brown" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1130/", - "locator": "document.querySelector('[data-index=\"configurable\"').outerText", - "required_contents": { - "must_include": [ - "Phoebe Zipper Sweatshirt-S-Brown" - ] - } - } - ] - }, - "intent_template_id": 252, - "changelogs": [ + 
"format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1130", + "site": "shopping_admin", + "expected": { "variants": ["Phoebe Zipper Sweatshirt-S-Brown"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 548, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 252, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}", "instantiation_dict": { "option": "color", @@ -31369,139 +15523,59 @@ "base_setting": "size S and M", "product": "Frankie Sweatshirt" }, - "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "110" - }, - "expected_data": { - "variants": [ - "Frankie Sweatshirt-M-Blue", - "Frankie Sweatshirt-S-Blue" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/110/", - "locator": "document.querySelector('[data-index=\"configurable\"').outerText", - "required_contents": { - "must_include": [ - "Sweatshirt-M-Blue", - "Sweatshirt-S-Blue" - ] - } - } - ] - }, - "intent_template_id": 252, - 
"changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "110", + "site": "shopping_admin", + "expected": { "variants": ["Frankie Sweatshirt-M-Blue", "Frankie Sweatshirt-S-Blue"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 549, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 252, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a new size XXXL to green Minerva LumaTech\u2122 V-Tee", "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "size", - "value": "XXXL", - "base_setting": "green", - "product": "Minerva LumaTech\u2122 V-Tee" - }, - "original.instantiation_dict": { "option": "size", "value": "XXXL", "base_setting": "green", "product": "Minerva LumaTech V-Tee" }, - "intent": "Add a new size XXXL to green Minerva LumaTech\u2122 V-Tee", - "original.intent": "Add a new size XXXL to green Minerva LumaTech V-Tee", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1492" - }, - "expected_data": { - "variants": [ - "Minerva LumaTech\u2122 V-Tee-XXXL-Green" - ] - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/1492/", - "locator": "document.querySelector('[data-index=\"configurable\"').outerText", - "required_contents": { - "must_include": [ - "V-Tee-XXXL-Green" - ] - } - } - ] - }, - "intent_template_id": 252, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "task_ambiguity", - "note": "Changed 'Minerva LumaTech V-Tee' to full product name 'Minerva LumaTech\u2122 V-Tee'" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1492", + "site": "shopping_admin", + "expected": { "variants": ["Minerva LumaTech\u2122 V-Tee-XXXL-Green"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 550, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 252, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", "instantiation_dict": { "option": "size", @@ -31509,64 +15583,29 @@ "base_setting": "blue and purple", "product": "Nona Fitness Tank" }, - "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1732" - }, - "expected_data": { - "variants": [ - "Nona Fitness Tank-XXS-Blue", - "Nona Fitness Tank-XXS-Purple" - ] - } - } - } - ], - "site": 
"shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1732/", - "locator": "document.querySelector('[data-index=\"configurable\"').outerText", - "required_contents": { - "must_include": [ - "Tank-XXS-Blue", - "Tank-XXS-Purple" - ] - } - } - ] - }, - "intent_template_id": 252, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1732", + "site": "shopping_admin", + "expected": { "variants": ["Nona Fitness Tank-XXS-Blue", "Nona Fitness Tank-XXS-Purple"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 551, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 252, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add new size 30 and 31 to all color variants of Diana Tights", "intent_template": "Add new {{option}} {{value}} to {{base_setting}} of {{product}}", "instantiation_dict": { "option": "size", @@ -31574,7198 +15613,3089 @@ "base_setting": "all color variants", "product": "Diana Tights" }, - "intent": "Add new size 30 and 31 to all color variants of Diana Tights", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1854" - }, - "expected_data": { - "variants": [ - "Diana Tights-30-Blue", - "Diana 
Tights-30-Black", - "Diana Tights-30-Orange", - "Diana Tights-31-Blue", - "Diana Tights-31-Black", - "Diana Tights-31-Orange" - ] - } - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "1854", + "site": "shopping_admin", + "expected": { + "variants": [ + "Diana Tights-30-Blue", "Diana Tights-30-Black", "Diana Tights-30-Orange", + "Diana Tights-31-Blue", "Diana Tights-31-Black", "Diana Tights-31-Orange" + ] } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1854/", - "locator": "document.querySelector('[data-index=\"configurable\"').outerText", - "required_contents": { - "must_include": [ - "Tights-30-Blue", - "Tights-30-Black", - "Tights-30-Orange", - "Tights-31-Blue", - "Tights-31-Black", - "Tights-31-Orange" - ] - } - } - ] - }, - "intent_template_id": 252, - "changelogs": [ - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 552, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "intent_template_id": 84, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a folder named real_space in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space subreddit.", + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", "instantiation_dict": { "directory": "real_space", "subreddit": "space", "gitlab_repo": "gimmiethat.space" }, - "intent": "Create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/real_space/urls.txt", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus", - "__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations", - "__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon", - "__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black", - "__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s" - ] - } - } - ] - }, - "intent_template_id": 84 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "real_space/urls.txt", + "substring": [ + "/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus", + 
"/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations", + "/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon", + "/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black", + "/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 553, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "intent_template_id": 84, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a folder named news in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits subreddit.", + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", "instantiation_dict": { "directory": "news", "gitlab_repo": "gimmiethat.space", "subreddit": "news related subreddits" }, - "intent": "Create a folder named news in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/news/urls.txt", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack", - "__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation", - "__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law", - "__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under", - "__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and" - ] - } - } - ] - }, - "intent_template_id": 84 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "news/urls.txt", + "substring": [ + "/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack", + "/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation", + "/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law", + "/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under", + "/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 554, - "require_login": true, - "storage_state": 
"./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "intent_template_id": 84, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a folder named moive_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies subreddit.", + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", "instantiation_dict": { "directory": "moive_space", "gitlab_repo": "gimmiethat.space", "subreddit": "movies" }, - "intent": "Create a folder named moive_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/moive_space/urls.txt", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make", - "__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in", - "__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of", - "__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for", - "__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia" - ] - } - } - ] - }, - "intent_template_id": 84 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "movie_space/urls.txt", + "substring": [ + "/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make", + "/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in", + "/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of", + "/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for", + "/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 555, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?", + "intent_template_id": 84, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a folder named funny_pic in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes subreddit.", + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", "instantiation_dict": { "directory": "funny_pic", "gitlab_repo": "gimmiethat.space", "subreddit": "memes" }, - "intent": "Create a folder named funny_pic in gimmiethat.space repo. 
Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/funny_pic/urls.txt", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/memes/127991/it-do-be-like-that-tho", - "__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you", - "__REDDIT__/f/memes/127989/if-you-have-no-other-choice", - "__REDDIT__/f/memes/127988/yes-yes-yes", - "__REDDIT__/f/memes/127987/shagadelic-baby" - ] - } - } - ] - }, - "intent_template_id": 84 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "funny_pic/urls.txt", + "substring": [ + "/f/memes/127991/it-do-be-like-that-tho", + "/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you", + "/f/memes/127989/if-you-have-no-other-choice", "/f/memes/127988/yes-yes-yes", + "/f/memes/127987/shagadelic-baby" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 556, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", - "instantiation_dict": { - "name": "nolan_honest_fans", - "topics": "movies directed by Christopher Nolan" - }, + "intent_template_id": 87, + "start_urls": 
["__GITLAB__", "__WIKIPEDIA__"], "intent": "Create a repo named nolan_honest_fans with movies directed by Christopher Nolan in a README file", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/nolan_honest_fans/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "Following", - "Memento", - "Insomnia", - "Batman Begins", - "The Prestige", - "The Dark Knight", - "Inception", - "The Dark Knight Rises", - "Interstellar", - "Dunkirk", - "Tenet", - "Oppenheimer" - ] - } - } - ] - }, - "intent_template_id": 87 + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": {"name": "nolan_honest_fans", "topics": "movies directed by Christopher Nolan"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "nolan_honest_fans", + "file_path": "README.md", + "substring": [ + "Following", "Memento", "Insomnia", "Batman Begins", "The Prestige", + "The Dark Knight", "Inception", "The Dark Knight Rises", "Interstellar", + "Dunkirk", "Tenet", "Oppenheimer" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 557, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 87, + "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], + "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a 
README file", "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", "instantiation_dict": { "name": "nolan_old_fans", "topics": "movies directed by Christopher Nolan before 2010" }, - "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a README file", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/nolan_old_fans/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "Following", - "Memento", - "Insomnia", - "Batman Begins", - "The Prestige", - "The Dark Knight" - ] - } - } - ] - }, - "intent_template_id": 87 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "nolan_old_fans", + "file_path": "README.md", + "substring": [ + "Following", "Memento", "Insomnia", "Batman Begins", "The Prestige", + "The Dark Knight" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 558, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 87, + "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], + "intent": "Create a repo named nolan_young_fans with movies directed by Christopher Nolan after 2010 in a README file", "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", "instantiation_dict": { "name": "nolan_young_fans", "topics": "movies directed by Christopher Nolan 
after 2010" }, - "intent": "Create a repo named nolan_young_fans with movies directed by Christopher Nolan after 2010 in a README file", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/nolan_young_fans/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "Inception", - "The Dark Knight Rises", - "Interstellar", - "Dunkirk", - "Tenet", - "Oppenheimer" - ] - } - } - ] - }, - "intent_template_id": 87 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "nolan_young_fans", + "file_path": "README.md", + "substring": [ + "Inception", "The Dark Knight Rises", "Interstellar", "Dunkirk", "Tenet", + "Oppenheimer" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 559, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", - "instantiation_dict": { - "name": "nolan_followers", - "topics": "career timeline of Christopher Nolan" - }, + "intent_template_id": 87, + "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], "intent": "Create a repo named nolan_followers with career timeline of Christopher Nolan in a README file", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": 
[ - { - "url": "__GITLAB__/byteblaze/nolan_followers/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "1993\u20132003: Early career and breakthrough", - "2003\u20132013: Widespread recognition", - "2014\u20132019: Established Hollywood auteur", - "2020\u2013present" - ] - } - } - ] - }, - "intent_template_id": 87 + "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "instantiation_dict": {"name": "nolan_followers", "topics": "career timeline of Christopher Nolan"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "nolan_followers", + "file_path": "README.md", + "substring": [ + "1993\u20132003: Early career and breakthrough", + "2003\u20132013: Widespread recognition", + "2014\u20132019: Established Hollywood auteur", "2020\u2013present" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 560, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 87, + "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], + "intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file", "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", "instantiation_dict": { "name": "nolan_academy_awards", "topics": "movies that won Academy Awards by Christopher Nolan" }, - "intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file", - "require_reset": false, - 
"eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/nolan_academy_awards/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "The Dark Knight", - "Inception", - "Interstellar", - "Dunkirk", - "Tenet" - ] - } - } - ] - }, - "intent_template_id": 87 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "nolan_academy_awards", + "file_path": "README.md", + "substring": ["The Dark Knight", "Inception", "Interstellar", "Dunkirk", "Tenet"], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "wikipedia" - ], + "sites": ["gitlab", "wikipedia"], "task_id": 561, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 87, + "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], + "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file", "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", "instantiation_dict": { "name": "bafta_awards_nolan", "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" }, - "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__GITLAB__/byteblaze/bafta_awards_nolan/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "Batman Begins", - "The Dark Knight", - "Inception", - "The Dark Knight Rises", - "Interstellar", - "Dunkirk", - "Tenet" - ] - } - } - ] - }, - "intent_template_id": 87 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "bafta_awards_nolan", + "file_path": "README.md", + "substring": [ + "Batman Begins", "The Dark Knight", "Inception", "The Dark Knight Rises", + "Interstellar", "Dunkirk", "Tenet" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 562, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", - "instantiation_dict": { - "name": "Awesome_DIY_ideas", - "num": 6 - }, - "intent": "create a repository named Awesome_DIY_ideas that includes a README file with the links to the most active 6 DIY ideas on DIY subreddit?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/Awesome_DIY_ideas/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", - "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - 
"__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "__REDDIT__/f/DIY/118931/afci-outlet-question" - ] - } - } - ] - }, - "intent_template_id": 88 + "intent_template_id": 88, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a repository named Awesome_DIY_ideas that includes a README file with the links to the most active 6 DIY ideas on DIY subreddit.", + "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "instantiation_dict": {"name": "Awesome_DIY_ideas", "num": 6}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "Awesome_DIY_ideas", + "file_path": "README.md", + "substring": [ + "/f/DIY/118903/separate-glued-plastic-parts", + "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "/f/DIY/118931/afci-outlet-question" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 563, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", - 
"instantiation_dict": { - "name": "fun_thing_to_do", - "num": 5 - }, - "intent": "create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/fun_thing_to_do/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", - "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches" - ] - } - } - ] - }, - "intent_template_id": 88 + "intent_template_id": 88, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit.", + "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "instantiation_dict": {"name": "fun_thing_to_do", "num": 5}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "fun_thing_to_do", + "file_path": "README.md", + "substring": [ + "/f/DIY/118903/separate-glued-plastic-parts", + "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + 
"/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 564, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", - "instantiation_dict": { - "name": "live_a_life", - "num": 3 - }, - "intent": "create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/live_a_life/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", - "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing" - ] - } - } - ] - }, - "intent_template_id": 88 + "intent_template_id": 88, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit.", + "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "instantiation_dict": {"name": "live_a_life", "num": 3}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "live_a_life", + "file_path": "README.md", + "substring": [ + "/f/DIY/118903/separate-glued-plastic-parts", + "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "/f/DIY/118935/basement-bulkhead-soffit-wall-framing" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 565, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?", - "instantiation_dict": { - "name": "TODO", - "num": 10 - }, - "intent": "create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/TODO/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", - "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "__REDDIT__/f/DIY/118931/afci-outlet-question", - "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", - "__REDDIT__/f/DIY/118866/paver-base-for-shed", - "__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service", - 
"__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons" - ] - } - } - ] - }, - "intent_template_id": 88 + "intent_template_id": 88, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit.", + "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "instantiation_dict": {"name": "TODO", "num": 10}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "TODO", + "file_path": "README.md", + "substring": [ + "/f/DIY/118903/separate-glued-plastic-parts", + "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "/f/DIY/118931/afci-outlet-question", + "/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "/f/DIY/118866/paver-base-for-shed", + "/f/DIY/118820/ways-to-locate-our-buried-electrical-service", + "/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 566, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY 
ideas on DIY subreddit?", - "instantiation_dict": { - "name": "Do it myself", - "num": 8 - }, - "intent": "create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit?", - "require_reset": false, - "eval": { - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/Do-it-myself/-/raw/main/README.md", - "locator": "", - "required_contents": { - "must_include": [ - "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts", - "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "__REDDIT__/f/DIY/118931/afci-outlet-question", - "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", - "__REDDIT__/f/DIY/118866/paver-base-for-shed" - ] - } - } - ] - }, - "intent_template_id": 88 + "intent_template_id": 88, + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit.", + "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "instantiation_dict": {"name": "Do it myself", "num": 8}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "Do-it-myself", + "file_path": "README.md", + "substring": [ + 
"/f/DIY/118903/separate-glued-plastic-parts", + "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", + "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", + "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", + "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", + "/f/DIY/118931/afci-outlet-question", + "/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", + "/f/DIY/118866/paver-base-for-shed" + ], + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 567, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "intent_template_id": 293, + "start_urls": ["__GITLAB__"], + "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator(s) to gimmiethat.space repo", + "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": { "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon", "repo": "gimmiethat.space" }, - "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator to gimmiethat.space repo", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "lahwaacz", - "bblanchon" - ], - "group": "byteblaze", - "project": "gimmiethat.space" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@bblanchon" - ] - } - } - ] - }, - "intent_template_id": 293, - 
"changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["lahwaacz", "bblanchon"], + "group": "byteblaze", + "project": "gimmiethat.space", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 568, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", - "instantiation_dict": { - "collaborator_account_list": "Abishek and Vinta", - "repo": "a11yproject.com" - }, - "intent": "Invite Abishek and Vinta as collaborator to a11yproject.com repo", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "abisubramanya27", - "vinta" - ], - "group": "a11yproject", - "project": "a11yproject.com" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@abisubramanya27", - "@vinta" - ] - } - } - ] - }, "intent_template_id": 293, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Invite Abishek and Vinta as collaborator(s) to a11yproject.com 
repo", + "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", + "instantiation_dict": {"collaborator_account_list": "Abishek and Vinta", "repo": "a11yproject.com"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["abisubramanya27", "vinta"], + "group": "a11yproject", + "project": "a11yproject.com", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 569, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "intent_template_id": 293, + "start_urls": ["__GITLAB__"], + "intent": "Invite Beno\u00eet and Abishek as collaborator(s) to my HTML5 markup extention repo", + "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": { "collaborator_account_list": "Beno\u00eet and Abishek", "repo": "my HTML5 markup extention" }, - "intent": "Invite Beno\u00eet and Abishek as collaborator to my HTML5 markup extention repo", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "bblanchon", - "abisubramanya27" - ], - "group": "byteblaze", - "project": "accessible-html-content-patterns" - } - } - } - ], - 
"site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@bblanchon", - "@abisubramanya27" - ] - } - } - ] - }, - "intent_template_id": 293, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["bblanchon", "abisubramanya27"], + "group": "byteblaze", + "project": "accessible-html-content-patterns", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 570, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo", + "intent_template_id": 293, + "start_urls": ["__GITLAB__"], + "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator(s) to my time tracking tool project repo", + "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": { "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon", "repo": "my time tracking tool project" }, - "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator to my 
time tracking tool project repo", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "lahwaacz", - "V13Axel", - "alexhutnik", - "bblanchon" - ], - "group": "byteblaze", - "project": "timeit" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/timeit/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@lahwaacz", - "@V13Axel", - "@alexhutnik", - "@bblanchon" - ] - } - } - ] - }, - "intent_template_id": 293, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["lahwaacz", "V13Axel", "alexhutnik", "bblanchon"], + "group": "byteblaze", + "project": "timeit", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 571, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": { - "address": "231 Willow Way, Suite 100, Chicago, IL, 60601" - }, - "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my 
information on OneStopShopping accordingly", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_address", - "expected_data": { - "address": "231 Willow Way", - "address2": "Suite 100", - "city": "Chicago", - "state": "Illinois", - "zip_code": "60601" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", - "required_contents": { - "must_include": [ - "231 Willow Way", - "Suite 100", - "Chicago, Illinois, 60601" - ] - } - }, - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", - "required_contents": { - "must_include": [ - "231 Willow Way", - "Suite 100", - "Chicago, Illinois, 60601" - ] - } - } - ] - }, "intent_template_id": 165, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": {"address": "231 Willow Way, Suite 100, Chicago, IL, 60601"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "address": "231 Willow Way", + "address2": "Suite 100", + "city": 
"Chicago", + "state": "Illinois", + "zip_code": "60601" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 572, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": { - "address": "654 Aspen Road, House #3, Boston, MA, 02110" - }, - "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_address", - "expected_data": { - "address": "654 Aspen Road", - "address2": "House #3", - "city": "Boston", - "state": "Massachusetts", - "zip_code": "02110" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", - "required_contents": { - "must_include": [ - "654 Aspen Road", - "House #3", - "Boston, Massachusetts, 02110" - ] - } - }, - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", - "required_contents": { - "must_include": [ - "654 Aspen Road", - "House #3", - "Boston, Massachusetts, 02110" - ] - } - } - ] - }, "intent_template_id": 165, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + 
"instantiation_dict": {"address": "654 Aspen Road, House #3, Boston, MA, 02110"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "address": "654 Aspen Road", + "address2": "House #3", + "city": "Boston", + "state": "Massachusetts", + "zip_code": "02110" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 573, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": { - "address": "987 Sycamore Circle, Philadelphia, PA, 19102" - }, - "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_address", - "expected_data": { - "address": "987 Sycamore Circle", - "city": "Philadelphia", - "state": "Pennsylvania", - "zip_code": "19102" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", - "required_contents": { - "must_include": [ - "987 Sycamore Circle", - "Philadelphia, Pennsylvania, 19102" - ] - } - }, - { - "url": 
"__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", - "required_contents": { - "must_include": [ - "987 Sycamore Circle", - "Philadelphia, Pennsylvania, 19102" - ] - } - } - ] - }, "intent_template_id": 165, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": {"address": "987 Sycamore Circle, Philadelphia, PA, 19102"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "address": "987 Sycamore Circle", + "city": "Philadelphia", + "state": "Pennsylvania", + "zip_code": "19102" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 574, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": { - "address": "111 Magnolia Path, Atlanta, GA, 30303" - }, - "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_address", - "expected_data": { 
- "address": "111 Magnolia Path", - "city": "Atlanta", - "state": "Georgia", - "zip_code": "30303" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", - "required_contents": { - "must_include": [ - "111 Magnolia Path", - "Atlanta, Georgia, 30303" - ] - } - }, - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", - "required_contents": { - "must_include": [ - "111 Magnolia Path", - "Atlanta, Georgia, 30303" - ] - } - } - ] - }, "intent_template_id": 165, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": {"address": "111 Magnolia Path, Atlanta, GA, 30303"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "address": "111 Magnolia Path", + "city": "Atlanta", + "state": "Georgia", + "zip_code": "30303" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 575, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - 
"geolocation": null, - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": { - "address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101" - }, - "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_address", - "expected_data": { - "address": "222 Redwood Rise", - "address2": "Suite 300", - "city": "Seattle", - "state": "Washington", - "zip_code": "98101" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText", - "required_contents": { - "must_include": [ - "222 Redwood Rise", - "Suite 300", - "Seattle, Washington, 98101" - ] - } - }, - { - "url": "__SHOPPING__/customer/address", - "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText", - "required_contents": { - "must_include": [ - "222 Redwood Rise", - "Suite 300", - "Seattle, Washington, 98101" - ] - } - } - ] - }, "intent_template_id": 165, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": {"address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": { + "address": "222 Redwood Rise", + "address2": "Suite 300", + "city": "Seattle", + "state": "Washington", + "zip_code": "98101" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 576, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 351, + "start_urls": ["__GITLAB__"], + "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "a11y-webring.club", "role": "developer", - "user_list": [ - "abisubramanya27", - "lahwaacz" - ] - }, - "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "abisubramanya27", - "lahwaacz" - ], - "group": "byteblaze", - "project": "a11y-webring.club", - "access_level": 30 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", - "required_contents": { - "must_include": [ - "Developer" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 
'lahwaacz')", - "required_contents": { - "must_include": [ - "Developer" - ] - } - } - ] + "user_list": ["abisubramanya27", "lahwaacz"] }, - "intent_template_id": 351, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["abisubramanya27", "lahwaacz"], + "group": "byteblaze", + "project": "a11y-webring.club", + "access_level": 30, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 577, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 351, + "start_urls": ["__GITLAB__"], + "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "GitHub timeline item management extension", "role": "maintainer", - "user_list": [ - "abisubramanya27", - "lahwaacz" - ] - }, - "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "abisubramanya27", - "lahwaacz" - ], - "group": "byteblaze", - "project": 
"remove-board-movement-events-from-the-github-issue-timeline", - "access_level": 40 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')", - "required_contents": { - "must_include": [ - "Maintainer" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')", - "required_contents": { - "must_include": [ - "Maintainer" - ] - } - } - ] + "user_list": ["abisubramanya27", "lahwaacz"] }, - "intent_template_id": 351, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["abisubramanya27", "lahwaacz"], + "group": "byteblaze", + "project": "remove-board-movement-events-from-the-github-issue-timeline", + "access_level": 40, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 578, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 351, + "start_urls": ["__GITLAB__"], + "intent": "Add the following users to repo 
millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "millennials-to-snake-people", "role": "reporter", - "user_list": [ - "yjlou", - "a11yproject" - ] - }, - "intent": "Add the following users to repo millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "yjlou", - "a11yproject" - ], - "group": "byteblaze", - "project": "millennials-to-snake-people", - "access_level": 20 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", - "required_contents": { - "must_include": [ - "Reporter" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'a11yproject')", - "required_contents": { - "must_include": [ - "Reporter" - ] - } - } - ] + "user_list": ["yjlou", "a11yproject"] }, - "intent_template_id": 351, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": 
["yjlou", "a11yproject"], + "group": "byteblaze", + "project": "millennials-to-snake-people", + "access_level": 20, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 579, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "time tracking tool", - "role": "guest", - "user_list": [ - "yjlou" - ] - }, - "intent": "Add the following users to my time tracking tool as guest: ['yjlou']", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_collaborators", - "eval_params": { - "collaborators": [ - "yjlou" - ], - "group": "byteblaze", - "project": "timeit", - "access_level": 10 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/timeit/-/project_members", - "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')", - "required_contents": { - "must_include": [ - "Guest" - ] - } - } - ] - }, "intent_template_id": 351, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Add the following users to my time tracking tool as guest: ['yjlou']", + "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { "repo": "time tracking tool", "role": "guest", "user_list": ["yjlou"] }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "collaborators": ["yjlou"], + "group": "byteblaze", + "project": "timeit", + "access_level": 10, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 580, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 7, + "start_urls": ["__REDDIT__"], + "intent": "Create a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?", "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", "instantiation_dict": { "name": "sci_fi", "description": "A wild place for sci-fi enthusiasts", - "sidebar_list": [ - "New", - "Classic", - "Movies", - "Post my novel", - "Random" - ] - }, - "intent": "Create a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_subreddit_has_description", - "expected_data": { - "subreddit": "sci_fi", - "description": [ - "New", - "Classic", - "Movies", - "Post my novel", - "Random" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/sci_fi/edit", - "locator": "document.querySelector(\"#forum_description\").value", - "required_contents": { - "must_include": [ - "A wild place for sci-fi enthusiasts" - ] - } - }, 
- { - "url": "__REDDIT__/f/sci_fi/edit", - "locator": "document.querySelector(\"#forum_sidebar\").value", - "required_contents": { - "must_include": [ - "New", - "Classic", - "Movies", - "Post my novel", - "Random" - ] - } - } - ] + "sidebar_list": ["New", "Classic", "Movies", "Post my novel", "Random"] }, - "intent_template_id": 7, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "sci_fi", + "description": "A wild place for sci-fi enthusiasts", + "sidebar": ["New", "Classic", "Movies", "Post my novel", "Random"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 581, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 7, + "start_urls": ["__REDDIT__"], + "intent": "Create a new forum named cmu_lti, with a description of Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?", "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", "instantiation_dict": { "name": "cmu_lti", "description": "Language Technologies Institute at Carnegie Mellon University", - "sidebar_list": [ - "announcement", - "paper", - "alumni" - ] - }, - "intent": "Create a new forum named cmu_lti, with a description of 
Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_subreddit_has_description", - "expected_data": { - "subreddit": "cmu_lti", - "description": [ - "announcement", - "paper", - "alumni" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/cmu_lti/edit", - "locator": "document.querySelector(\"#forum_description\").value", - "required_contents": { - "must_include": [ - "Language Technologies Institute at Carnegie Mellon University" - ] - } - }, - { - "url": "__REDDIT__/f/cmu_lti/edit", - "locator": "document.querySelector(\"#forum_sidebar\").value", - "required_contents": { - "must_include": [ - "announcement", - "paper", - "alumni" - ] - } - } - ] + "sidebar_list": ["announcement", "paper", "alumni"] }, - "intent_template_id": 7, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "cmu_lti", + "description": "Language Technologies Institute at Carnegie Mellon University", + "sidebar": ["announcement", "paper", "alumni"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 582, - "require_login": true, - 
"storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 7, + "start_urls": ["__REDDIT__"], + "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", "instantiation_dict": { "name": "Cyberpunk", "description": "Welcome to the future", - "sidebar_list": [ - "Games", - "Books", - "Movies", - "Future" - ] - }, - "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_subreddit_has_description", - "expected_data": { - "subreddit": "sci_fi", - "description": [ - "Games", - "Books", - "Movies", - "Future" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/Cyberpunk/edit", - "locator": "document.querySelector(\"#forum_description\").value", - "required_contents": { - "must_include": [ - "Welcome to the future" - ] - } - }, - { - "url": "__REDDIT__/f/Cyberpunk/edit", - "locator": "document.querySelector(\"#forum_sidebar\").value", - "required_contents": { - "must_include": [ - "Games", - "Books", - "Movies", - "Future" - ] - } - } - ] + "sidebar_list": ["Games", "Books", "Movies", "Future"] }, - "intent_template_id": 7, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": 
{"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "Cyberpunk", + "description": "Welcome to the future", + "sidebar": ["Games", "Books", "Movies", "Future"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 583, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 7, + "start_urls": ["__REDDIT__"], + "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plan lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?", "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", "instantiation_dict": { "name": "PlantsForCatParents", "description": "Cat parents & plan lovers", - "sidebar_list": [ - "Cat friendly", - "Local vendors", - "Promotion", - "Toxic plants!" - ] - }, - "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plan lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_subreddit_has_description", - "expected_data": { - "subreddit": "PlantsForCatParents", - "description": [ - "Cat friendly", - "Local vendors", - "Promotion", - "Toxic plants!" 
- ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/PlantsForCatParents/edit", - "locator": "document.querySelector(\"#forum_description\").value", - "required_contents": { - "must_include": [ - "Cat parents & plan lovers" - ] - } - }, - { - "url": "__REDDIT__/f/PlantsForCatParents/edit", - "locator": "document.querySelector(\"#forum_sidebar\").value", - "required_contents": { - "must_include": [ - "Cat friendly", - "Local vendors", - "Promotion", - "Toxic plants!" - ] - } - } - ] + "sidebar_list": ["Cat friendly", "Local vendors", "Promotion", "Toxic plants!"] }, - "intent_template_id": 7, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "PlantsForCatParents", + "description": "Cat parents & plan lovers", + "sidebar": ["Cat friendly", "Local vendors", "Promotion", "Toxic plants!"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 584, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 7, + "start_urls": ["__REDDIT__"], + "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?", "intent_template": "Create a 
new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", "instantiation_dict": { "name": "Karaoke", "description": "Place for Karaoke lovers", - "sidebar_list": [ - "devices", - "setup" - ] - }, - "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_subreddit_has_description", - "expected_data": { - "subreddit": "Karaoke", - "description": [ - "devices", - "setup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/Karaoke", - "locator": "document.querySelector(\"#forum_description\").value", - "required_contents": { - "must_include": [ - "Place for Karaoke lovers" - ] - } - }, - { - "url": "__REDDIT__/f/Karaoke", - "locator": "document.querySelector(\"#forum_sidebar\").value", - "required_contents": { - "must_include": [ - "devices", - "setup" - ] - } - } - ] + "sidebar_list": ["devices", "setup"] }, - "intent_template_id": 7, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "Karaoke", + "description": "Place for Karaoke lovers", + "sidebar": ["devices", "setup"] + } } - ] + ], + 
"revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 585, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", - "instantiation_dict": { - "product": "floor lamp", - "num_star": 5, - "nickname": "Emma Lopez" - }, - "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_review", - "expected_data": { - "sku": "B00J8RZL7I", - "num_star": 5, - "nickname": "Emma Lopez" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_rating('B00J8RZL7I')", - "required_contents": { - "must_include": [ - "100" - ] - } - }, - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_author('B00J8RZL7I')", - "required_contents": { - "must_include": [ - "Emma Lopez" - ] - } - } - ] - }, "intent_template_id": 194, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?", + "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "instantiation_dict": {"product": "floor lamp", "num_star": 5, "nickname": "Emma Lopez"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom 
dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B00J8RZL7I", "num_star": 5, "nickname": "Emma Lopez"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 586, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 194, + "start_urls": ["__SHOPPING__"], + "intent": "Rate my recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?", "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", "instantiation_dict": { "product": "Jiffy Corn Muffin Cornbread Mix", "num_star": 4, "nickname": "ShoppingEmma" }, - "intent": "Rate my recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_review", - "expected_data": { - "sku": "B07HZB38XH", - "num_star": 4, - "nickname": "ShoppingEmma" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_rating('B07HZB38XH')", - "required_contents": { - "must_include": [ - "80" - ] - } - }, - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_author('B07HZB38XH')", - "required_contents": { - "must_include": [ - "ShoppingEmma" - ] - } - } - ] - }, - "intent_template_id": 194, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": 
"expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07HZB38XH", "num_star": 4, "nickname": "ShoppingEmma"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 587, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 194, + "start_urls": ["__SHOPPING__"], + "intent": "Rate my recent purchase of PS3 Remote Controller Skins with 3 stars, using my nickname GamingEmma?", "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", "instantiation_dict": { "product": "PS3 Remote Controller Skins", "num_star": 3, "nickname": "GamingEmma" }, - "original.instantiation_dict": { - "product": "PS3 Remote Controllers", - "num_star": 3, - "nickname": "GamingEmma" - }, - "intent": "Rate my recent purchase of PS3 Remote Controller Skins with 3 stars, using my nickname GamingEmma?", - "original.intent": "Rate my recent purchase of PS3 Remote Controllers with 3 stars, using my nickname GamingEmma?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_review", - "expected_data": { - "sku": "B0041MSF2S", - "num_star": 3, - "nickname": "GamingEmma" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_rating('B0041MSF2S')", - "required_contents": { - "must_include": [ - "60" - ] - } - }, - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_author('B0041MSF2S')", - "required_contents": { - "must_include": [ - "GamingEmma" - ] - } - } - ] - }, - 
"intent_template_id": 194, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "reference_alignment", - "note": "Changed product name to indicate the desired product is a skin not the controller" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B0041MSF2S", "num_star": 3, "nickname": "GamingEmma"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 588, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 194, + "start_urls": ["__SHOPPING__"], + "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using my nickname ShoppingEmma?", "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", "instantiation_dict": { "product": "Foundation For Mattress With Frame Set", "num_star": 1, "nickname": "ShoppingEmma" }, - "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using my nickname ShoppingEmma?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_review", - "expected_data": { - "sku": "B07DFJ5XKH", - "num_star": 1, - "nickname": "ShoppingEmma" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": 
"func:shopping_get_sku_latest_review_rating('B07DFJ5XKH')", - "required_contents": { - "must_include": [ - "20" - ] - } - }, - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_author('B07DFJ5XKH')", - "required_contents": { - "must_include": [ - "ShoppingEmma" - ] - } - } - ] - }, - "intent_template_id": 194, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B07DFJ5XKH", "num_star": 1, "nickname": "ShoppingEmma"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 589, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 194, + "start_urls": ["__SHOPPING__"], + "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname SimpleEmma?", "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", "instantiation_dict": { "product": "Mini Wireless Bluetooth Speaker", "num_star": 2, "nickname": "SimpleEmma" }, - "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname SimpleEmma?", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_review", - "expected_data": { - "sku": "B09P7BFL4H", - "num_star": 2, - "nickname": "SimpleEmma" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - 
"reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_rating('B09P7BFL4H')", - "required_contents": { - "must_include": [ - "40" - ] - } - }, - { - "url": "last", - "locator": "func:shopping_get_sku_latest_review_author('B09P7BFL4H')", - "required_contents": { - "must_include": [ - "SimpleEmma" - ] - } - } - ] - }, - "intent_template_id": 194, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "shopping", + "expected": {"sku": "B09P7BFL4H", "num_star": 2, "nickname": "SimpleEmma"} } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 590, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 339, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2023 and ending on 1/30/2023", "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", "instantiation_dict": { "event": "event of product launch", "start_date": "1/16/2023", "end_date": "1/30/2023" }, - "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2023 and ending on 1/30/2023", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_milestone_exist_with_fields", - "eval_params": { - "group": "primer", - "project": "design", - "values": { - "title": "product launch", - 
"start_date": "2023-01-16", - "due_date": "2023-01-30" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/milestones", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"#content-body\").outerText", - "required_contents": { - "must_include": [ - "product launch" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.start_date').outerText", - "required_contents": { - "must_include": [ - "Jan 16, 2030" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.due_date').outerText", - "required_contents": { - "must_include": [ - "Jan 30, 2030" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 339, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "values": { + "title": "product launch", + "start_date": "2023-01-16", + "due_date": "2023-01-30" + }, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 591, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 
339, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2023 and ending on in 20 days", "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", "instantiation_dict": { "event": "practice of collective code review", "start_date": "1/16/2023", "end_date": "in 20 days" }, - "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2023 and ending on in 20 days", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_milestone_exist_with_fields", - "eval_params": { - "group": "primer", - "project": "design", - "values": { - "title": "code review", - "start_date": "2023-01-16", - "due_date": "2023-02-05" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/milestones", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"#content-body\").outerText", - "required_contents": { - "must_include": [ - "code review" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.start_date').outerText", - "required_contents": { - "must_include": [ - "Jan 16, 2030" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.due_date').outerText", - "required_contents": { - "must_include": [ - "Feb 5, 2030" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 339, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + 
"results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "values": {"title": "code review", "start_date": "2023-01-16", "due_date": "2023-02-05"}, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 592, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 339, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2023 and ending on in 20 days", "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", "instantiation_dict": { "event": "task of cleaning sensitive information", "start_date": "2/16/2023", "end_date": "in 20 days" }, - "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2023 and ending on in 20 days", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_milestone_exist_with_fields", - "eval_params": { - "group": "primer", - "project": "design", - "values": { - "title": "sensitive information", - "start_date": "2023-02-16", - "due_date": "2023-03-08" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/milestones", - "program_html": [ - { - "url": "last", - "locator": 
"document.querySelector(\"#content-body\").outerText", - "required_contents": { - "must_include": [ - "sensitive information" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.start_date').outerText", - "required_contents": { - "must_include": [ - "Feb 16, 2030" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.due_date').outerText", - "required_contents": { - "must_include": [ - "Mar 8, 2030" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 339, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "values": { + "title": "sensitive information", + "start_date": "2023-02-16", + "due_date": "2023-03-08" + }, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 593, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/dotfiles", - "geolocation": null, + "intent_template_id": 339, + "start_urls": ["__GITLAB__/byteblaze/dotfiles"], + "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", 
"instantiation_dict": { "event": "task of merging all branches to main", "start_date": "March 15, 2044", "end_date": "March 30, 2044" }, - "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_milestone_exist_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "dotfiles", - "values": { - "title": "all branches to main", - "start_date": "2044-03-15", - "due_date": "2044-03-30" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"#content-body\").outerText", - "required_contents": { - "must_include": [ - "all branches to main" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.start_date').outerText", - "required_contents": { - "must_include": [ - "Mar 15, 2044" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.due_date').outerText", - "required_contents": { - "must_include": [ - "Mar 30, 2044" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 339, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to 
backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "values": { + "title": "all branches to main", + "start_date": "2044-03-15", + "due_date": "2044-03-30" + }, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 594, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/dotfiles", - "geolocation": null, + "intent_template_id": 339, + "start_urls": ["__GITLAB__/byteblaze/dotfiles"], + "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days", "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", "instantiation_dict": { "event": "task of adding a new branch for zsh comprehensive support", "start_date": "5/1/2044", "end_date": "in 20 days" }, - "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_milestone_exist_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "dotfiles", - "values": { - "title": "zsh comprehensive support", - "start_date": "2044-05-01", - "due_date": "2044-05-21" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"#content-body\").outerText", - "required_contents": { - "must_include": [ - "zsh comprehensive support" - ] - } - }, - { - "url": "last", - "locator": 
"document.querySelector('.block.start_date').outerText", - "required_contents": { - "must_include": [ - "May 1, 2044" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.due_date').outerText", - "required_contents": { - "must_include": [ - "May 21, 2044" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 339, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "values": { + "title": "zsh comprehensive support", + "start_date": "2044-05-01", + "due_date": "2044-05-21" + }, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 595, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": { - "subreddit": "space" - }, + "intent_template_id": 4, + "start_urls": ["__REDDIT__"], "intent": "Open the thread of a trending post on the forum \"space\" and subscribe.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_subscribed_to_subreddit", - "expected_data": { - "subreddit": "space" - } - } - } - ], - "site": "reddit" - }, - "original.eval": 
{ - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/space", - "program_html": [ - { - "url": "__REDDIT__/", - "locator": "document.querySelector(\"#sidebar > section\").outerText", - "required_contents": { - "must_include": [ - "space" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 4, - "changelogs": [ + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": {"subreddit": "space"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "space"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 596, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": { - "subreddit": "books" - }, - "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_subscribed_to_subreddit", - "expected_data": { - "subreddit": "books" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": 
"__REDDIT__/f/books", - "program_html": [ - { - "url": "__REDDIT__/", - "locator": "document.querySelector(\"#sidebar > section\").outerText", - "required_contents": { - "must_include": [ - "books" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 4, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.", + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": {"subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "books"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 597, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": { - "subreddit": "consoles" - }, - "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_subscribed_to_subreddit", - "expected_data": { - "subreddit": "consoles" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": 
null, - "reference_url": "__REDDIT__/f/consoles", - "program_html": [ - { - "url": "__REDDIT__/", - "locator": "document.querySelector(\"#sidebar > section\").outerText", - "required_contents": { - "must_include": [ - "consoles" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 4, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.", + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": {"subreddit": "consoles"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "consoles"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 598, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": { - "subreddit": "pittsburgh" - }, - "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_subscribed_to_subreddit", - "expected_data": { - "subreddit": "pittsburgh" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - 
"program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/pittsburgh", - "program_html": [ - { - "url": "__REDDIT__/", - "locator": "document.querySelector(\"#sidebar > section\").outerText", - "required_contents": { - "must_include": [ - "pittsburgh" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 4, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.", + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": {"subreddit": "pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "pittsburgh"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 599, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": { - "subreddit": "machine learning" - }, - "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_subscribed_to_subreddit", - "expected_data": { - "subreddit": "machine learning" - } - } - } - ], - "site": 
"reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/MachineLearning", - "program_html": [ - { - "url": "__REDDIT__/", - "locator": "document.querySelector(\"#sidebar > section\").outerText", - "required_contents": { - "must_include": [ - "MachineLearning" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 4, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.", + "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", + "instantiation_dict": {"subreddit": "machine learning"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "machine learning"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 600, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": { - "question": "what is the recommended console to buy these days" - }, - "intent": "Post my question, \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "consoles", - "title": "what is the recommended console to buy these days" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/consoles", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "what is the recommended console to buy these days" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 3765, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post my question with the title \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": {"question": "what is the recommended console to buy these days"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "consoles", + "title": "what is the recommended console to buy these days" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 601, - "require_login": true, - "storage_state": 
"./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": { - "question": "is car necessary in NYC" - }, - "intent": "Post my question, \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "nyc", - "title": "is car necessary in NYC?" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/nyc", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "is car necessary in NYC" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 3765, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post my question with the title \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": {"question": "is car necessary in NYC"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": 
"BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "nyc", "title": "is car necessary in NYC?"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 602, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": { - "question": "places for new drivers to learn driving in pittsburgh" - }, - "intent": "Post my question, \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "pittsburgh", - "title": "places for new drivers to learn driving in pittsburgh" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/pittsburgh", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "places for new drivers to learn driving in pittsburgh" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 3765, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post my question with the title \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": {"question": "places for new drivers to learn driving in pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": 
"expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "pittsburgh", + "title": "places for new drivers to learn driving in pittsburgh" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 603, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": { - "question": "safe and budget apartment to live in nyc" - }, - "original.instantiation_dict": { - "question": "safe and budge apartment to live in nyc" - }, - "intent": "Post my question, \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer", - "original.intent": "Post my question, \"safe and budge apartment to live in nyc\", in a subreddit where I'm likely to get an answer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "nyc", - "title": "safe and budget apartment to live in nyc" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/nyc", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - 
"must_include": [ - "safe and budge apartment to live in nyc" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 3765, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post my question with the title \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": {"question": "safe and budget apartment to live in nyc"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Typo in the question" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" - }, - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "nyc", "title": "safe and budget apartment to live in nyc"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 604, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": { - "question": "what is the SOTA web navigation agent repo" - }, - "intent": "Post my question, \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": 
"validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "consoles", - "title": "what is the recommended console to buy these days" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "what is the SOTA web navigation agent repo" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 3765, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post my question with the title \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "instantiation_dict": {"question": "what is the SOTA web navigation agent repo"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": ["deeplearning", "MachineLearning", "singularity"], + "title": "what is the SOTA web navigation agent repo" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 605, - "require_login": true, - 
"storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 5, + "start_urls": ["__REDDIT__"], + "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "gaming consoles", "question": "what is the recommended console to buy these days" }, - "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "consoles", - "title": "what is the recommended console to buy these days" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/consoles", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "what is the recommended console to buy these days" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 5, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent 
evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "consoles", + "title": "what is the recommended console to buy these days" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 606, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", - "instantiation_dict": { - "topic": "NYC", - "question": "is car necessary" - }, - "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "nyc", - "title": "is car necessary" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/nyc", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "is car necessary" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 5, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there", + "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "instantiation_dict": {"topic": "NYC", "question": "is car necessary"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of 
dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "nyc", "title": "is car necessary"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 607, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 5, + "start_urls": ["__REDDIT__"], + "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "city Pittsburgh", "question": "places for new drivers to learn driving" }, - "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "pittsburgh", - "title": "places for new drivers to learn driving" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/pittsburgh", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "places for new drivers to learn driving" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - 
"intent_template_id": 5, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "pittsburgh", "title": "places for new drivers to learn driving"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 608, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 5, + "start_urls": ["__REDDIT__"], + "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "city lives in DMV area", "question": "safe and budget apartment to live" }, - "original.instantiation_dict": { - "topic": "city lives in DMV area", - "question": "safe and budge apartment to live" - }, - "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", - "original.intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budge apartment to live\" there", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - 
"subreddit": "washington", - "title": "safe and budge apartment to live" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/washington |OR| __REDDIT__/f/washingtondc", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "safe and budge apartment to live" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 5, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Typo in the question" - }, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "washington", "title": "safe and budge apartment to live"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 609, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 5, + "start_urls": ["__REDDIT__"], + "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": 
"ML, DL, NLP", "question": "what is the SOTA web navigation agent repo" }, - "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "deeplearning", - "title": "what is the SOTA web navigation agent repo" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "what is the SOTA web navigation agent repo" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 5, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "deeplearning", + "title": "what is the SOTA web navigation agent repo" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 610, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - 
"intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", - "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", - "instantiation_dict": { - "book": "To Kill a Mockingbird by Harper Lee", - "content": "good book!" - }, - "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in the r/books and put my comment \"good book!\" in the body.", - "original.intent": "Post a review of my recent reading \"To Kill a Mockingbird by Harper Lee\" in the r/books with my comment \"good book!\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "content": "good book!", - "title": "To Kill a Mockingbird by Harper Lee" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "To Kill a Mockingbird by Harper Lee", - "good book!" 
- ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 9, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the book title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in the r/books and put my comment \"good book!\" in the body.", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "instantiation_dict": {"book": "To Kill a Mockingbird by Harper Lee", "content": "good book!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "books", + "content": "good book!", + "title": "To Kill a Mockingbird by Harper Lee" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 611, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", - "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", - "instantiation_dict": { - "book": "Harry Potter", - "content": "Wonderful journey" - }, - "intent": "Post a review of my recent 
reading, titled \"Harry Potter\" in the r/books and put my comment \"Wonderful journey\" in the body.", - "original.intent": "Post a review of my recent reading \"Harry Potter\" in the r/books with my comment \"Wonderful journey\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "content": "Wonderful journey", - "title": "Harry Potter" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Harry Potter", - "Wonderful journey" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 9, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the book title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post a review of my recent reading, titled \"Harry Potter\" in the r/books and put my comment \"Wonderful journey\" in the body.", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "instantiation_dict": {"book": "Harry Potter", "content": "Wonderful journey"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from 
dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "books", "content": "Wonderful journey", "title": "Harry Potter"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 612, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", - "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", - "instantiation_dict": { - "book": "big little lies", - "content": "can't stop it" - }, - "intent": "Post a review of my recent reading, titled \"big little lies\" in the r/books and put my comment \"can't stop it\" in the body.", - "original.intent": "Post a review of my recent reading \"big little lies\" in the r/books with my comment \"can't stop it\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "content": "can't stop it", - "title": "big little lies" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "big little lies", - "can't stop it" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 9, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the book title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post a 
review of my recent reading, titled \"big little lies\" in the r/books and put my comment \"can't stop it\" in the body.", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "instantiation_dict": {"book": "big little lies", "content": "can't stop it"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "books", "content": "can't stop it", "title": "big little lies"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 613, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", - "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", - "instantiation_dict": { - "book": "Love story", - "content": "I cried" - }, - "intent": "Post a review of my recent reading, titled \"Love story\" in the r/books and put my comment \"I cried\" in the body.", - "original.intent": "Post a review of my recent reading \"Love story\" in the r/books with my comment \"I cried\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - 
"expected_data": { - "subreddit": "books", - "comment": "I cried", - "title": "Love story" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Love story", - "I cried" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 9, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the book title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post a review of my recent reading, titled \"Love story\" in the r/books and put my comment \"I cried\" in the body.", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "instantiation_dict": {"book": "Love story", "content": "I cried"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "books", "comment": "I cried", "title": "Love story"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 614, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - 
"intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", - "original.intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".", - "instantiation_dict": { - "book": "Gone with the wind", - "content": "It's a book with history" - }, - "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in the r/books and put my comment \"It's a book with history\" in the body.", - "original.intent": "Post a review of my recent reading \"Gone with the wind\" in the r/books with my comment \"It's a book with history\".", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "content": "It's a book with history", - "title": "Gone with the wind" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Gone with the wind", - "It's a book with history" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 9, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the book title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in the r/books and put my comment \"It's a book with history\" in the body.", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "instantiation_dict": {"book": "Gone with the wind", "content": "It's a book with 
history"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "books", + "content": "It's a book with history", + "title": "Gone with the wind" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 615, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pics", - "geolocation": null, - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", - "instantiation_dict": { - "content": "Bald Eagle", - "subreddit": "earthporn" - }, - "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and note \"from /f/pics\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/earthporn" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/earthporn", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "from /f/pics" - ] - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' 
')", - "required_contents": { - "must_include": [ - "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg" - ] - } + "intent_template_id": 11, + "start_urls": ["__REDDIT__/f/pics"], + "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", + "instantiation_dict": {"content": "Bald Eagle", "subreddit": "earthporn"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "earthporn", + "file_name": "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg", + "title": "from /f/pics" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 11 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 616, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pics", - "geolocation": null, - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", - "instantiation_dict": { - "content": "Thanksgiving turkey", - "subreddit": "funny" - }, - "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and note \"from /f/pics\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/funny" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/funny", - "program_html": [ - { - "url": 
"func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "from /f/pics" - ] - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg" - ] - } + "intent_template_id": 11, + "start_urls": ["__REDDIT__/f/pics"], + "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", + "instantiation_dict": {"content": "Thanksgiving turkey", "subreddit": "funny"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "funny", + "file_name": "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg", + "title": "from /f/pics" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 11 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 617, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pics", - "geolocation": null, - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", - "instantiation_dict": { - "content": "Firework", - "subreddit": "earthporn" - }, - "intent": "Re-post the image of 
Firework in this page to earthporn subreddit and note \"from /f/pics\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/earthporn" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/earthporn", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "from /f/pics" - ] - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg" - ] - } + "intent_template_id": 11, + "start_urls": ["__REDDIT__/f/pics"], + "intent": "Re-post the image of Firework in this page to earthporn subreddit and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", + "instantiation_dict": {"content": "Firework", "subreddit": "earthporn"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "earthporn", + "file_name": "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg", + "title": "from /f/pics" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 11 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], 
"task_id": 618, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pics", - "geolocation": null, - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", - "instantiation_dict": { - "content": "Wife's costume", - "subreddit": "funny" - }, - "intent": "Re-post the image of Wife's costume in this page to funny subreddit and note \"from /f/pics\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/funny" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/funny", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "from /f/pics" - ] - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg" - ] - } + "intent_template_id": 11, + "start_urls": ["__REDDIT__/f/pics"], + "intent": "Re-post the image of Wife's costume in this page to funny subreddit and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", + "instantiation_dict": {"content": "Wife's costume", "subreddit": "funny"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": 
null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "funny", + "file_name": "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg", + "title": "from /f/pics" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 11 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 619, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pics", - "geolocation": null, - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"", - "instantiation_dict": { - "content": "costume contest", - "subreddit": "funny" - }, - "intent": "Re-post the image of costume contest in this page to funny subreddit and note \"from /f/pics\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/funny" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/funny", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "from /f/pics" - ] - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg" - ] - } + "intent_template_id": 11, + "start_urls": ["__REDDIT__/f/pics"], + "intent": "Re-post the image of costume contest in this page to funny subreddit and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from 
/f/pics\"", + "instantiation_dict": {"content": "costume contest", "subreddit": "funny"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "funny", + "file_name": "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg", + "title": "from /f/pics" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 11 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 620, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", - "instantiation_dict": { - "issue": "deal with long-distance relationships" - }, - "intent": "Ask for advice about deal with long-distance relationships in a subreddit for relations", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - "title": [ - "long", - "distance", - "relationship" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "long", - "distance", - "relation" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 12, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": 
"Ask for advice with the title \"deal with long-distance relationships\" in a subreddit for relations", + "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "instantiation_dict": {"issue": "deal with long-distance relationships"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "relationship_advice", + "title": ["long", "distance", "relationship"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 621, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", - "instantiation_dict": { - "issue": "cheat" - }, - "intent": "Ask for advice about cheat in a subreddit for relations", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - "title": "cheat" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - 
"must_include": [ - "cheat" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 12, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for advice with the title \"cheat\" in a subreddit for relations", + "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "instantiation_dict": {"issue": "cheat"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "relationship_advice", "title": "cheat"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 622, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", - "instantiation_dict": { - "issue": "sexual harassment" - }, - "intent": "Ask for advice about sexual harassment in a subreddit for relations", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - "title": "sexual harassment" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - 
"locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "sexual", - "harassment" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 12, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for advice with the title \"sexual harassment\" in a subreddit for relations", + "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "instantiation_dict": {"issue": "sexual harassment"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"subreddit": "relationship_advice", "title": "sexual harassment"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 623, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", - "instantiation_dict": { - "issue": "gift for birthday" - }, - "intent": "Ask for advice about gift for birthday in a subreddit for relations", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - "title": [ - "gift", - "birthday" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - 
"reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "gift", - "birthday" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 12, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for advice with the title \"gift for birthday\" in a subreddit for relations", + "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "instantiation_dict": {"issue": "gift for birthday"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "relationship_advice", "title": ["gift", "birthday"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 624, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for advice about {{issue}} in a subreddit for relations", - "instantiation_dict": { - "issue": "break-up remedy" - }, - "intent": "Ask for advice about break-up remedy in a subreddit for relations", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - 
"title": [ - "break", - "up", - "remedy" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "break", - "remedy" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 12, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for advice with the title \"break-up remedy\" in a subreddit for relations", + "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "instantiation_dict": {"issue": "break-up remedy"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "relationship_advice", "title": ["break", "up", "remedy"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 625, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant 
subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": { - "topic": "the effectiveness of deep learning" - }, - "original.instantiation_dict": { - "topic": "the effectiveness of online learning" - }, - "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent": "Create a discussion post about \"the effectiveness of online learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "deeplearning", - "content": [ - "your opinion" - ], - "title": "the effectiveness of deep learning" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "your opinion", - "the effectiveness of online learning" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 13, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Relevant subreddit is difficult to decide based on the original topic" - }, - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Ambiguous subreddits based on original topic" - }, + "start_urls": ["__REDDIT__"], + "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + 
"intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": {"topic": "the effectiveness of deep learning"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "deeplearning", + "content": ["your opinion"], + "title": "the effectiveness of deep learning" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 626, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": { - "topic": "Iphone 14" - }, - "intent": "Create a discussion post titled \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent": "Create a discussion post about \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - 
"name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "iphone", - "content": [ - "your opinion" - ], - "title": "Iphone 14" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/iphone", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "your opinion", - "Iphone 14" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 13, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the post title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Create a discussion post titled \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": {"topic": "Iphone 14"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "iphone", "content": ["your opinion"], "title": "Iphone 14" } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 627, - "require_login": true, 
- "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": { - "topic": "Harry Potter movie series" - }, - "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent": "Create a discussion post about \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "movies", - "content": [ - "your opinion" - ], - "title": "Harry Potter movie series" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/movies", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "your opinion", - "Harry Potter movie series" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 13, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the post title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + 
"intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": {"topic": "Harry Potter movie series"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "movies", + "content": ["your opinion"], + "title": "Harry Potter movie series" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 628, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 13, + "start_urls": ["__REDDIT__"], + "intent": "Create a discussion post titled \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": { - "topic": "long distance relationship" - }, - "intent": "Create a discussion post titled \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - 
"original.intent": "Create a discussion post about \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "relationship_advice", - "content": [ - "your opinion" - ], - "title": "long distance relationship" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/relationship_advice", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "your opinion", - "long distance relationship" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 13, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the post title and content" - }, + "instantiation_dict": {"topic": "long distance relationship"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "relationship_advice", + "content": ["your opinion"], + "title": "long distance relationship" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], 
"task_id": 629, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": { - "topic": "Fun thing to do in Pittsburgh" - }, - "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "original.intent": "Create a discussion post about \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "pittsburgh", - "content": [ - "your opinion" - ], - "title": "Fun thing to do in Pittsburgh" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/pittsburgh", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "your opinion", - "Fun thing to do in Pittsburgh" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 13, - "changelogs": [ - { - "key": "intent_template", - "category": "clarify_instructions", - "note": "Need to specify the post title and content" - }, + "start_urls": ["__REDDIT__"], + "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask 
users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "instantiation_dict": {"topic": "Fun thing to do in Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "pittsburgh", + "content": ["your opinion"], + "title": "Fun thing to do in Pittsburgh" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 630, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", + "intent_template_id": 15, + "start_urls": ["__REDDIT__"], + "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones. Put all relevant information in the body of the post.", + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. 
Put all relevant information in the body of the post.", "instantiation_dict": { "category": "noise-cancelling headphones", "price": "$200", "subreddit": "r/headphones" }, - "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "headphones", - "title": [ - "200", - "noise", - "cancelling", - "headphones" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/headphones", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "200", - "noise-cancelling", - "headphone" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 15, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "headphones", + "content": ["200", "noise", "cancelling", "headphones"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 631, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - 
"intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", - "instantiation_dict": { - "category": "running shoes", - "price": "$100", - "subreddit": "r/sports" - }, - "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "sports", - "title": [ - "running", - "shoes", - "100" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/sports", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "100", - "running", - "shoes" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 15, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports. Put all relevant information in the body of the post.", + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "running shoes", "price": "$100", "subreddit": "r/sports"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "sports", "content": ["running", "shoes", "100"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 632, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", - "instantiation_dict": { - "category": "running shoes", - "price": "$500", - "subreddit": "r/sports" - }, - "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "sports", - "title": [ - "running", - "shoes", - "500" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/sports", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "500", - 
"running", - "shoes" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 15, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports. Put all relevant information in the body of the post.", + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "running shoes", "price": "$500", "subreddit": "r/sports"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "sports", "content": ["running", "shoes", "500"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 633, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}", - "instantiation_dict": { - "category": "running pants", - "price": "$500", - "subreddit": "r/sports" - }, - "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "sports", - "title": [ - 
"running", - "pants", - "500" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/sports", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "500", - "running", - "pants" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 15, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports. Put all relevant information in the body of the post.", + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "running pants", "price": "$500", "subreddit": "r/sports"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "sports", "content": ["running", "pants", "500"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 634, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in 
{{subreddit}}", - "instantiation_dict": { - "category": "used iphone", - "price": "$1000", - "subreddit": "r/iphone" - }, - "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "iphone", - "title": [ - "1000", - "used iphone" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/iphone", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "1000", - "used iphone" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 15, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone. Put all relevant information in the body of the post.", + "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "used iphone", "price": "$1000", "subreddit": "r/iphone"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "iphone", "content": ["1000", "used", "iphone"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 635, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", - "instantiation_dict": { - "category": "noise-cancelling headphones", - "price": "$200" - }, - "intent": "Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "headphones", - "title": [ - "200", - "noise", - "cancelling", - "headphones" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/headphones", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": 
"document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "200", - "noise-cancelling", - "headphone" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 6100, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200. Put all relevant information in the body of the post.", + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "noise-cancelling headphones", "price": "$200"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "headphones", + "content": ["200", "noise", "cancelling", "headphones"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 636, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", - "instantiation_dict": { - "category": "DIY toolkit", - "price": "$100" - }, - "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY 
toolkit products within a budget of $100", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "diy", - "title": [ - "100", - "DIY", - "toolkit" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/DIY", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "100", - "DIY", - "toolkit" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 6100, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY toolkit products within a budget of $100. Put all relevant information in the body of the post.", + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "DIY toolkit", "price": "$100"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "diy", "content": ["100", "DIY", "toolkit"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 637, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", - "instantiation_dict": { - "category": "sony headphones", - "price": "$500" - }, - "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "headphones", - "title": [ - "500", - "sony", - "headphones" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/headphones", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ 
- "500", - "sony headphone" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 6100, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500. Put all relevant information in the body of the post.", + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "sony headphones", "price": "$500"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "headphones", "content": ["500", "sony", "headphones"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 638, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", - "instantiation_dict": { - "category": "must-have product in my life", - "price": "$30" - }, - "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product in my life products within a budget of $30", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": 
"__REDDIT__/f/BuyItForLife" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/BuyItForLife", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "30", - "must-have", - "product", - "life" - ] - } + "intent_template_id": 6100, + "start_urls": ["__REDDIT__"], + "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product to buy for life products within a budget of $30. Put all relevant information in the body of the post.", + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "must-have product to buy for life", "price": "$30"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "BuyItForLife", + "content": ["30", "must-have", "product", "life"] } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 6100 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 639, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}", - "instantiation_dict": { - "category": "used iphone", - "price": "$1000" - }, - "intent": "Post in the most appropriate 
subreddit and ask for recommendations for used iphone products within a budget of $1000", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "iphone", - "title": [ - "1000", - "used iphone" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/iphone", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "1000", - "used iphone" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 6100, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post in the most appropriate subreddit and ask for recommendations for used iphone products within a budget of $1000. Put all relevant information in the body of the post.", + "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"category": "used iphone", "price": "$1000"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "iphone", "content": ["1000", "used", "iphone"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 640, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", - "instantiation_dict": { - "interest": "book reading", - "date": "March 15th", - "subreddit": "r/books" - }, - "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "book reading", - "March 15th", - "virtual meetup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - 
"must_include": [ - "book reading", - "March 15th", - "virtual meetup" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 16, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit. Put all relevant information in the body of the post.", + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", + "instantiation_dict": {"interest": "book reading", "date": "March 15th", "subreddit": "r/books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "books", + "content": ["book reading", "March", "15", "virtual meetup"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 641, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", - "instantiation_dict": { - "interest": "Harry Potter", - "date": "July 8th", - "subreddit": "books" - }, - "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - 
"eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "Harry Potter", - "July 8th", - "virtual meetup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Harry Potter", - "July 8th", - "virtual meetup" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 16, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit. Put all relevant information in the body of the post.", + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"interest": "Harry Potter", "date": "July 8th", "subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "books", + "content": ["Harry Potter", "July", "8", "virtual meetup"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 642, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", - "instantiation_dict": { - "interest": "Big little lies", - "date": "Sep 10th", - "subreddit": "books" - }, - "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "Big little lies", - "Sep 10th", - "virtual meetup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": 
"document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Big little lies", - "Sep 10th", - "virtual meetup" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 16, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit. Put all relevant information in the body of the post.", + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", + "instantiation_dict": {"interest": "Big little lies", "date": "Sep 10th", "subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "books", + "content": ["Big little lies", "Sep", "10", "virtual meetup"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 643, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", - "instantiation_dict": { - "interest": "racing cars", - "date": "Oct 21st", - "subreddit": "nyc" - }, - "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc 
subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "racing cars", - "Oct 21st", - "virtual meetup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/nyc", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "racing cars", - "Oct 21st", - "virtual meetup" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 16, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc subreddit. Put all relevant information in the body of the post.", + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. 
Put all relevant information in the body of the post.", + "instantiation_dict": {"interest": "racing cars", "date": "Oct 21st", "subreddit": "nyc"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "nyc", + "content": ["racing cars", "Oct", "21", "virtual meetup"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 644, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit", - "instantiation_dict": { - "interest": "Tears of Kingdom", - "date": "Dec 15th", - "subreddit": "games" - }, - "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the games subreddit", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "Tears of Kingdom", - "Dec 15th", - "virtual meetup" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/games", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": 
"document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "Tears of Kingdom", - "Dec 15th", - "virtual meetup" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 16, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the games subreddit. Put all relevant information in the body of the post.", + "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", + "instantiation_dict": {"interest": "Tears of Kingdom", "date": "Dec 15th", "subreddit": "games"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "gaming", + "content": ["Tears of Kingdom", "Dec", "15", "virtual meetup"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 645, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "original.intent_template": "Post in {{subreddit}} subreddit about what could machine learning help the correpong field.", - "instantiation_dict": { - "model_type": "machine learning", - "subreddit": "books" - }, - 
"original.instantiation_dict": { - "subreddit": "books" - }, - "intent": "Post in books subreddit about what could machine learning help the corresponding field.", - "original.intent": "Post in books subreddit about what could machine learning help the correpong field.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "books", - "title": [ - "machine learning", - "help" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "machine learning", - "help" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 19, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Post in books subreddit about what could machine learning help the corresponding field.", + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "instantiation_dict": {"model_type": "machine learning", "subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Grammatical error in task intent template" - }, + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "books", "title": ["machine learning", "help"] } + } + ], + "revision": 2 + }, + { + "sites": 
["reddit"], + "task_id": 646, + "intent_template_id": 19, + "start_urls": ["__REDDIT__"], + "intent": "Post in DIY subreddit about what could midjourney help the corresponding field.", + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "instantiation_dict": {"model_type": "midjourney", "subreddit": "DIY"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "diy", "title": ["midjourney", "help"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], - "task_id": 646, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "original.intent_template": "Post in {{subreddit}} subreddit about what could midjourney help the correpong field.", - "instantiation_dict": { - "model_type": "midjourney", - "subreddit": "DIY" - }, - "original.instantiation_dict": { - "subreddit": "DIY" - }, - "intent": "Post in DIY subreddit about what could midjourney help the corresponding field.", - "original.intent": "Post in DIY subreddit about what could midjourney help the correpong field.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "diy", - 
"title": [ - "midjourney", - "help" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/diy", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "midjourney", - "help" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 19, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Grammatical error in task intent template" - }, - { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" - }, - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] - }, - { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 647, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "original.intent_template": "Post in {{subreddit}} forum about what could open-source LLMs help the correpong field.", - "instantiation_dict": { - "model_type": "open-source LLMs", - "subreddit": "technology" - }, - "original.instantiation_dict": { - "subreddit": "technology" - }, - "intent": "Post in technology subreddit about what could open-source LLMs help the corresponding field.", - "original.intent": "Post in technology forum about what could open-source LLMs help the correpong field.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": 
"validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "technology", - "title": [ - "open-source LLMs", - "help" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/technology", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "open-source LLMs", - "help" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 19, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Grammatical error in task intent template" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post in technology subreddit about what could open-source LLMs help the corresponding field.", + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "instantiation_dict": {"model_type": "open-source LLMs", "subreddit": "technology"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "technology", "title": ["open-source LLMs", "help"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 648, - 
"require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "original.intent_template": "Post in {{subreddit}} forum about what could large language models help the correpong field.", - "instantiation_dict": { - "model_type": "large language models", - "subreddit": "dataisbeautiful" - }, - "original.instantiation_dict": { - "subreddit": "dataisbeautiful" - }, - "intent": "Post in dataisbeautiful subreddit about what could large language models help the corresponding field.", - "original.intent": "Post in dataisbeautiful forum about what could large language models help the correpong field.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "dataisbeautiful", - "title": [ - "large language models", - "help" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/dataisbeautiful", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "large language models", - "help" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 19, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Grammatical error in task intent template" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post in dataisbeautiful subreddit about what could large language models help the corresponding field.", + "intent_template": "Post in {{subreddit}} subreddit about what could 
{{model_type}} help the corresponding field.", + "instantiation_dict": {"model_type": "large language models", "subreddit": "dataisbeautiful"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "dataisbeautiful", "title": ["large language models", "help"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 649, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "original.intent_template": "Post in {{subreddit}} subreddit about what could diffusion model help the correpong field.", - "instantiation_dict": { - "model_type": "diffusion model", - "subreddit": "dataisbeautiful" - }, - "original.instantiation_dict": { - "subreddit": "history" - }, - "intent": "Post in dataisbeautiful subreddit about what could diffusion model help the corresponding field.", - "original.intent": "Post in history subreddit about what could diffusion model help the correpong field.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "history", - "title": [ - "diffusion models", - "help" - ] - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - 
"url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/history", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "diffusion model", - "help" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 19, - "changelogs": [ - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Fix task wording" - }, - { - "key": "intent_template", - "category": "spelling_or_grammar", - "note": "Grammatical error in task intent template" - }, + "start_urls": ["__REDDIT__"], + "intent": "Post in dataisbeautiful subreddit about what could diffusion model help the corresponding field.", + "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", + "instantiation_dict": {"model_type": "diffusion model", "subreddit": "dataisbeautiful"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "subreddit": "dataisbeautiful", "title": ["diffusion models", "help"] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 650, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", - "geolocation": null, + "intent_template_id": 23, + 
"start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], + "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"", "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", "content_description": "I am a big fan of the bookorg" }, - "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_comment_exists_in_post", - "expected_data": { - "post_id": "59421", - "comment_content": "I am a big fan of the bookorg" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.comment__body').outerText", - "required_contents": { - "exact_match": "I am a big fan of the bookorg" - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 23, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"post_id": "59421", "comment_content": "I am a big fan of the bookorg"} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + 
"sites": ["reddit"], "task_id": 651, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", - "geolocation": null, + "intent_template_id": 23, + "start_urls": [ + "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign" + ], + "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"", "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", "content_description": "Yeah, pittsburgh traffic, you know..." }, - "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_comment_exists_in_post", - "expected_data": { - "post_id": "45899", - "comment_content": "Yeah, pittsburgh traffic, you know..." - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.comment__body').outerText", - "required_contents": { - "exact_match": "Yeah, pittsburgh traffic, you know..." 
- } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 23, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"post_id": "45899", "comment_content": "Yeah, pittsburgh traffic, you know..."} } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 652, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", - "geolocation": null, - "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", - "instantiation_dict": { - "position_description": "the post", - "content_description": "???" - }, - "intent": "Reply to the post with my comment \"???\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_comment_exists_in_post", - "expected_data": { - "post_id": "134852", - "comment_content": "???" 
- } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.comment__body').outerText", - "required_contents": { - "exact_match": "???" - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 23, - "changelogs": [ + "start_urls": [ + "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research" + ], + "intent": "Reply to the post with my comment \"???\"", + "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "instantiation_dict": {"position_description": "the post", "content_description": "???"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": {"post_id": "134852", "comment_content": "???"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 653, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "instantiation_dict": { - "product": "phone screen protector", - "order_id": "000000180" - }, - "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", - "original.intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that it broke after just three days of use. Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "000000180", - "B087QJN9W1" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 153, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "instantiation_dict": {"product": "phone screen protector", "order_id": "000000180"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 654, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", - "instantiation_dict": { - "product": "bluetooth speaker", - "order_id": "161" - }, - "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", - "original.intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "161", - "B09P7BFL4H" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 153, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": {"product": "bluetooth speaker", "order_id": "161"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 655, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "instantiation_dict": { - "product": "iphone case", - "order_id": "180" - }, - "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. 
Don't submit yet, I will check.", - "original.intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "180", - "B087QJN9W1" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 153, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": {"product": "iphone case", "order_id": "180"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 656, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "instantiation_dict": { - "product": "remote controller", - "order_id": "180" - }, - "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. 
Don't submit yet, I will check.", - "original.intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "it broke after three days of use", - "180", - "B0041MSF2S" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 153, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": {"product": "remote controller", "order_id": "180"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 657, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "original.intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", - "instantiation_dict": { - "product": "speaker", - "order_id": "148" - }, - "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. 
Don't submit yet, I will check.", - "original.intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "refund", - "broke after three days of use", - "148", - "B003FVW3VA" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 153, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don't submit yet, I will check.", + "instantiation_dict": {"product": "speaker", "order_id": "148"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clarify the intent to provide exact string to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 658, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 327, + "start_urls": ["__GITLAB__"], + "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. Set due date to be the end of 2030", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", "instantiation_dict": { "repo": "a11yproject", @@ -38773,86 +18703,35 @@ "account": "Roshanjossey", "due": "the end of 2030" }, - "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. 
Set due date to be the end of 2030", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "fields": { - "title": "401 bad gateway", - "assignee.username": "Roshanjossey", - "due_date": "2030-12-31" - } - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", - "required_contents": { - "exact_match": "401 bad gateway" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", - "required_contents": { - "must_include": [ - "Dec 31, 2030" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.assignee').outerText", - "required_contents": { - "must_include": [ - "Roshan Jossey" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 327, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "fields": { + "title": "401 bad gateway", + "assignee.username": "Roshanjossey", + "due_date": "2030-12-31" + }, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 
}, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 659, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 327, + "start_urls": ["__GITLAB__"], + "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. Set due date to be the beginning of Q2 2033", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", "instantiation_dict": { "repo": "empathy-prompts", @@ -38860,86 +18739,35 @@ "account": "Roshanjossey", "due": "the beginning of Q2 2033" }, - "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. Set due date to be the beginning of Q2 2033", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "empathy-prompts", - "fields": { - "title": "Integrating LLMs for better prompts", - "assignee.username": "Roshanjossey", - "due_date": "2033-04-01" - } - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", - "required_contents": { - "exact_match": "Integrating LLMs for better prompts" - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "group": 
"byteblaze", + "project": "empathy-prompts", + "fields": { + "title": "Integrating LLMs for better prompts", + "assignee.username": "Roshanjossey", + "due_date": "2033-04-01" }, - { - "url": "last", - "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", - "required_contents": { - "must_include": [ - "Apr 1, 2033" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.assignee').outerText", - "required_contents": { - "must_include": [ - "Roshan Jossey" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 327, - "changelogs": [ - { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" - }, - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 660, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 327, + "start_urls": ["__GITLAB__"], + "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. Set due date to be July 18 2033", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", "instantiation_dict": { "repo": "dotfiles", @@ -38947,2110 +18775,1076 @@ "account": "Abishek", "due": "July 18 2033" }, - "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. 
Set due date to be July 18 2033", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "dotfiles", - "fields": { - "title": "add support for oh-my-zsh", - "assignee.username": "abisubramanya27", - "due_date": "2033-07-18" - } - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/dotfiles/-/tree/main", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", - "required_contents": { - "exact_match": "add support for oh-my-zsh" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", - "required_contents": { - "must_include": [ - "Jul 18, 2033" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.assignee').outerText", - "required_contents": { - "must_include": [ - "Abishek S" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 327, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "fields": { + "title": "add support for oh-my-zsh", + "assignee.username": "abisubramanya27", + "due_date": "2033-07-18" + }, + "site": "gitlab", + "expected": true } - ] + ], + 
"revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 661, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"Connection refused\" to discuss the connection refused exception in the respository ChatGPT.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "ChatGPT", - "issue": "report the issue of connection refused" - }, - "intent": "Open an issue to report the issue of connection refused in ChatGPT.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "convexegg", - "project": "chatgpt", - "fields": {} - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/convexegg/chatgpt/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "connection refused" - ] - } - } - ], - "url_note": "GOLD in PRED" + "title": "Connection refused", + "issue": "discuss the connection refused exception", + "repo": "ChatGPT" }, - "intent_template_id": 328, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": 
"dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "convexegg", + "project": "chatgpt", + "fields": {"title": "Connection refused"}, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 662, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"OSError: [Errno 98] Address already in use\" to discuss the OS error in the respository aem-hacker.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "aem-hacker", - "issue": "report experiencing \"OSError: [Errno 98] Address already in use\" during executions" - }, - "intent": "Open an issue to report experiencing \"OSError: [Errno 98] Address already in use\" during executions in aem-hacker.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "0ang3el", - "project": "aem-hacker", - "fields": {} - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/0ang3el/aem-hacker/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "OSError: [Errno 98] Address already in use" - ] - } - } - ], - "url_note": "GOLD in PRED" + "title": "OSError: [Errno 98] Address already in use", + "issue": "discuss the OS error", + "repo": "aem-hacker" }, - "intent_template_id": 328, - "changelogs": [ + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "0ang3el", + "project": "aem-hacker", + "fields": {"title": "OSError: [Errno 98] Address already in use"}, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 663, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"Llama model support timeline\" to ask the team's plan on supporting Llama and other llama family models in the respository metaseq.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "metaseq", - "issue": "ask their plan on supporting Llama and other llama family models" - }, - "intent": "Open an issue to ask their plan on supporting Llama and other llama family models in metaseq.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "root", - "project": "metaseq", - "fields": {} - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/root/metaseq/-/issues", - 
"program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "llama" - ] - } - } - ], - "url_note": "GOLD in PRED" + "title": "Llama model support timeline", + "issue": "ask the team's plan on supporting Llama and other llama family models", + "repo": "metaseq" }, - "intent_template_id": 328, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "root", + "project": "metaseq", + "fields": {"title": "Llama model support timeline"}, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 664, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"Question on future usage of Python 3.11\" to ask the team's plans on adding Python 3.11 related resources in the respository awesome-python.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "awesome-python", - "issue": "ask their plans on adding Python 3.11 related resources" - }, - "intent": "Open an issue to ask their plans on adding Python 3.11 related resources in awesome-python.", - "require_reset": 
false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "vinta", - "project": "awesome-python", - "fields": {} - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/vinta/awesome-python/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "Python 3.11" - ] - } - } - ], - "url_note": "GOLD in PRED" + "title": "Question on future usage of Python 3.11", + "issue": "ask the team's plans on adding Python 3.11 related resources", + "repo": "awesome-python" }, - "intent_template_id": 328, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "vinta", + "project": "awesome-python", + "fields": {"title": "Question on future usage of Python 3.11"}, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 665, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Open an issue to {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"Feature Request: MT support\" to request adding support for MT 
theme editor in the respository a11y-syntax-highlighting.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "a11y-syntax-highlighting", - "issue": "request adding support for MT theme editor" - }, - "intent": "Open an issue to request adding support for MT theme editor in a11y-syntax-highlighting.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "a11y-syntax-highlighting", - "fields": {} - } - } - } - ], - "site": "gitlab" + "title": "Feature Request: MT support", + "issue": "request adding support for MT theme editor", + "repo": "a11y-syntax-highlighting" }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "MT theme editor" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 328, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "a11y-syntax-highlighting", + "fields": {"title": "Feature Request: MT support"}, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 
}, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 666, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Submit a request to merge dialog-component branch into dialog branch, assign Primer as the reviewer", "intent_template": "Submit a request to merge {{source_branch}} branch into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "dialog-component", "target_branch": "dialog", - "reviewer": "Carol" + "reviewer": "Primer" }, - "intent": "Submit a request to merge dialog-component branch into dialog branch, assign Carol as the reviewer", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - "required_contents": { - "exact_match": "dialog" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "dialog-component" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - "must_include": [ - "Caroline Stewart" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": 
"AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "No user named Carol" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "source_branch": "dialog-component", + "target_branch": "dialog", + "reviewer": "Primer", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 667, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat branch, assign primer as the reviewer", "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "dialog-component", "target_branch": "bump-doctocat", "reviewer": "primer" }, - "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat branch, assign primer as the reviewer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_merge_request_has_values", - "eval_params": { - "group": "primer", - "project": "design", - "source_branch": "dialog-component", - "target_branch": "bump-doctocat", - "reviewer": "primer" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - 
"required_contents": { - "exact_match": "bump-doctocat" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "dialog-component" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - "must_include": [ - "Primer" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "source_branch": "dialog-component", + "target_branch": "bump-doctocat", + "reviewer": "primer", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 668, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__"], + "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into main branch, assign Roshan Jossy as the reviewer", "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "a11yproject.com/redesign", - "target_branch": "master", - "reviewer": "Justin 
Armstrong" - }, - "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into master branch, assign Justin Armstrong as the reviewer", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "redesign" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - "required_contents": { - "exact_match": "main" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - "must_include": [ - "Justin Armstrong" - ] - } - } - ], - "url_note": "GOLD in PRED" + "target_branch": "main", + "reviewer": "Roshan Jossy" }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "No branch named master" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "source_branch": "redesign", + "target_branch": "main", + "reviewer": "Roshan Jossy", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 669, - "require_login": true, - 
"storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/solarized-prism-theme", - "geolocation": null, - "intent_template": "Open a new issue to discuss the implementation of {{feature}}", - "instantiation_dict": { - "feature": "dark mode" - }, - "intent": "Open a new issue to discuss the implementation of dark mode", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "solarized-prism-theme", - "fields": { - "title": "dark mode" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "implementation", - "dark mode" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 337, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/solarized-prism-theme"], + "intent": "Open a new issue to discuss the implementation of dark mode", + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": {"feature": "dark mode"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + 
"evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "solarized-prism-theme", + "fields": {"title": "dark mode"}, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 670, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/byteblaze/dotfiles", - "geolocation": null, - "intent_template": "Open a new issue to discuss the implementation of {{feature}}", - "instantiation_dict": { - "feature": "default plugins for .zsh" - }, - "intent": "Open a new issue to discuss the implementation of default plugins for .zsh", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "dotfiles", - "fields": { - "title": "default plugins for .zsh" - }, - "title_field_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/dotfiles/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.detail-page-description').outerText", - "required_contents": { - "must_include": [ - "implementation", - "default plugins", - "zsh" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 337, - "changelogs": [ + "start_urls": ["__GITLAB__/byteblaze/dotfiles"], + "intent": "Open a new issue to discuss the implementation of default plugins for .zsh", + "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "instantiation_dict": {"feature": "default plugins for .zsh"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": 
"switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "fields": {"title": "default plugins for .zsh"}, + "title_field_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping", - "reddit" - ], + "sites": ["shopping", "reddit"], "task_id": 671, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", - "instantiation_dict": { - "product": "Sony Computer Entertainment VR", - "rating": "2 stars and less" - }, + "intent_template_id": 101, + "start_urls": ["__SHOPPING__", "__REDDIT__"], "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Sony Computer Entertainment VR\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/gaming" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/gaming", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__title').outerText", - "required_contents": { - "exact_match": "real user feedback on Sony Computer Entertainment VR" - } - 
}, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "didn't last a year without issues", - "Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.", - "Received used items!!" - ] - } + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": {"product": "Sony Computer Entertainment VR", "rating": "2 stars and less"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/gaming", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 101 + } + ], + "revision": 2 }, { - "sites": [ - "shopping", - "reddit" - ], + "sites": ["shopping", "reddit"], "task_id": 672, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 101, + "start_urls": ["__SHOPPING__", "__REDDIT__"], + "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", 
"instantiation_dict": { "product": "Nintendo Switch Fortnite Wildcat Console EU", "rating": "3 stars and less" }, - "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/gaming" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/gaming", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__title').outerText", - "required_contents": { - "exact_match": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU" - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "EU charger and wild cat card doesn\u2019t even work!", - "REFUND REJECTED", - "Charging port not compatible", - "not compatible in the US", - "Wildcard Bonus Credits Not Redeemable!", - "Code not available!!" 
- ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/gaming", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 101 + } + ], + "revision": 2 }, { - "sites": [ - "shopping", - "reddit" - ], + "sites": ["shopping", "reddit"], "task_id": 673, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", - "instantiation_dict": { - "product": "Racing Wheel Overdrive for Xbox X", - "rating": "1 star" - }, + "intent_template_id": 101, + "start_urls": ["__SHOPPING__", "__REDDIT__"], "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/gaming" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/gaming", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__title').outerText", - "required_contents": { - "exact_match": "real user feedback on Racing Wheel Overdrive for Xbox X" - } - }, - { - "url": 
"func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "Unable to set neutral steering", - "Doesn\u2019t work with PC", - "Crazy problems in automatic mode", - "pedals stopped working", - "Only works with certain games" - ] - } + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": {"product": "Racing Wheel Overdrive for Xbox X", "rating": "1 star"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/gaming", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 101 + } + ], + "revision": 2 }, { - "sites": [ - "shopping", - "reddit" - ], + "sites": ["shopping", "reddit"], "task_id": 674, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 101, + "start_urls": ["__SHOPPING__", "__REDDIT__"], + "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", "instantiation_dict": { 
"product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", "rating": "3 stars and less" }, - "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/gaming" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/gaming", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__title').outerText", - "required_contents": { - "exact_match": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine" - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "Poorly Made Exterior. 
Consider a different Company.", - "piece of junk ,..can't believe I spent money on this !!!!", - "Based arrived broken but game itself works" - ] - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/gaming", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 101 + } + ], + "revision": 2 }, { - "sites": [ - "shopping", - "reddit" - ], + "sites": ["shopping", "reddit"], "task_id": 675, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", - "instantiation_dict": { - "product": "HORI 3D Surround Gaming Neckset", - "rating": "2 stars and less" - }, + "intent_template_id": 101, + "start_urls": ["__SHOPPING__", "__REDDIT__"], "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/gaming" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/gaming", - "program_html": [ - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__title').outerText", - 
"required_contents": { - "exact_match": "real user feedback on HORI 3D Surround Gaming Neckset" - } - }, - { - "url": "func:reddit_get_post_url('__last_url__')", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "Not worth it for PC users", - "I really wanted to like this.", - "I wish this was better..." - ] - } + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "instantiation_dict": {"product": "HORI 3D Surround Gaming Neckset", "rating": "2 stars and less"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/gaming", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 101 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 676, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to and filter for orders that are {{status}}", - "original.intent_template": "Lookup orders that are {{status}}", - "instantiation_dict": { - "status": "suspected of being fraudulent" - }, - "intent": "Navigate to and filter for orders that are suspected of being fraudulent", - "original.intent": "Lookup orders that are suspected of being fraudulent", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/", - "eval_func": { - 
"name": "verify_ui_selector_value", - "eval_params": { - "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", - "expected_value": "Suspected Fraud" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales/order/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", - "required_contents": { - "must_include": [ - "Suspected Fraud" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 253, - "changelogs": [ - { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" - }, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Navigate to the list of orders that are suspected of being fraudulent", + "intent_template": "Navigate to the list of orders that are {{status}}", + "instantiation_dict": {"status": "suspected of being fraudulent"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 677, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to and filter for orders that are {{status}}", - "original.intent_template": "Lookup orders that are {{status}}", - "instantiation_dict": { - "status": "processing" - }, - 
"intent": "Navigate to and filter for orders that are processing", - "original.intent": "Lookup orders that are processing", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/", - "eval_func": { - "name": "verify_ui_selector_value", - "eval_params": { - "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", - "expected_value": "Processing" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales/order/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", - "required_contents": { - "must_include": [ - "Processing" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 253, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Navigate to the list of orders that are processing", + "intent_template": "Navigate to the list of orders that are {{status}}", + "instantiation_dict": {"status": "processing"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "url_match_mode": "prefix", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "response_status": 200, + "headers": { + "referer": 
"__SHOPPING_ADMIN__/sales/order/", + "X-Requested-With": "XMLHttpRequest" + }, + "query_string": { + "namespace": "sales_order_grid", + "filters[placeholder]": "true", + "filters[status]": "processing", + "search": "", + "keywordUpdated": "false" + } + }, + "site": "shopping_admin", + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 678, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to and filter for orders that are {{status}}", - "original.intent_template": "Lookup orders that are {{status}}", - "instantiation_dict": { - "status": "canceled" - }, - "intent": "Navigate to and filter for orders that are canceled", - "original.intent": "Lookup orders that are canceled", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/", - "eval_func": { - "name": "verify_ui_selector_value", - "eval_params": { - "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", - "expected_value": "Canceled" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales/order/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", - "required_contents": { - "must_include": [ - "Canceled" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 253, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Navigate to the list of orders that are canceled", + "intent_template": "Navigate to the list of orders that are {{status}}", + 
"instantiation_dict": {"status": "canceled"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "url_match_mode": "prefix", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "response_status": 200, + "headers": { + "referer": "__SHOPPING_ADMIN__/sales/order/", + "X-Requested-With": "XMLHttpRequest" + }, + "query_string": { + "namespace": "sales_order_grid", + "filters[placeholder]": "true", + "filters[status]": "canceled", + "search": "", + "keywordUpdated": "false" + } + }, + "site": "shopping_admin", + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 679, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to and filter for orders that are {{status}}", - "original.intent_template": "Lookup orders that are {{status}}", - "instantiation_dict": { - "status": "completed" - }, - "intent": "Navigate to and filter for orders that are completed", - "original.intent": "Lookup orders that are completed", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/", - "eval_func": { - "name": "verify_ui_selector_value", - "eval_params": { - "locator": "div.admin__data-grid-filters-current._show > 
div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", - "expected_value": "Completed" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales/order/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", - "required_contents": { - "must_include": [ - "Completed" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 253, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Navigate to the list of orders that are completed", + "intent_template": "Navigate to the list of orders that are {{status}}", + "instantiation_dict": {"status": "completed"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "url_match_mode": "prefix", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "response_status": 200, + "headers": { + "referer": "__SHOPPING_ADMIN__/sales/order/", + "X-Requested-With": "XMLHttpRequest" + }, + "query_string": { + "namespace": "sales_order_grid", + "filters[placeholder]": "true", + "filters[status]": "complete", + "search": "", + "keywordUpdated": "false" + } + }, + "site": "shopping_admin", + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } - ] + ], + "revision": 2 }, { - 
"sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 680, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Navigate to and filter for orders that are {{status}}", - "original.intent_template": "Lookup orders that are {{status}}", - "instantiation_dict": { - "status": "on hold" - }, - "intent": "Navigate to and filter for orders that are on hold", - "original.intent": "Lookup orders that are on hold", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/sales/order/", - "eval_func": { - "name": "verify_ui_selector_value", - "eval_params": { - "locator": "div.admin__data-grid-filters-current._show > div.admin__current-filters-list-wrap > ul.admin__current-filters-list > li > span:nth-of-type(2)", - "expected_value": "On Hold" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales/order/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText", - "required_contents": { - "must_include": [ - "On Hold" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 253, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Navigate to the list of orders that are on hold", + "intent_template": "Navigate to the list of orders that are {{status}}", + "instantiation_dict": {"status": "on hold"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "task_ambiguity", - "note": "Clearly specify to navigate to the orders page instead of returning a list of orders" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", 
"status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "url_match_mode": "prefix", + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "response_status": 200, + "headers": { + "referer": "__SHOPPING_ADMIN__/sales/order/", + "X-Requested-With": "XMLHttpRequest" + }, + "query_string": { + "namespace": "sales_order_grid", + "filters[placeholder]": "true", + "filters[status]": "holded", + "search": "", + "keywordUpdated": "false" + } + }, + "site": "shopping_admin", + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 681, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", - "instantiation_dict": { - "topic": "gan implementation" - }, + "intent_template_id": 116, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Find a GitLab repository related to gan implementation and make a Reddit post linking to it in a relevant subreddit", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "gan implementation", - 
"__GITLAB__/eriklindernoren/PyTorch-GAN" - ] - } + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": {"topic": "gan implementation"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": [ + "__REDDIT__/f/machinelearning", "__REDDIT__/f/deeplearning", + "__REDDIT__/f/technology" + ], + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 116 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 682, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", - "instantiation_dict": { - "topic": "chatGPT" - }, + "intent_template_id": 116, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology", - "program_html": [ - { - "url": 
"last", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "chatgpt", - "__GITLAB__/convexegg/chatgpt" - ] - } + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": {"topic": "chatGPT"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": [ + "__REDDIT__/f/machinelearning", "__REDDIT__/f/singularity", + "__REDDIT__/f/deeplearning", "__REDDIT__/f/technology" + ], + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 116 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 683, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", - "instantiation_dict": { - "topic": "metaseq" - }, + "intent_template_id": 116, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Find a GitLab repository related to metaseq and make a Reddit post linking to it in a relevant subreddit", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning", - 
"program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText", - "required_contents": { - "must_include": [ - "metaseq", - "__GITLAB__/root/metaseq" - ] - } + "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "instantiation_dict": {"topic": "metaseq"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": ["__REDDIT__/f/machinelearning", "__REDDIT__/f/deeplearning"], + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 116 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 684, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": { - "repo": "byteblaze/cloud-to-butt", - "subreddit": "LifeProTips" - }, + "intent_template_id": 117, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Promote byteblaze/cloud-to-butt to subreddit LifeProTips with the description from the repo itself.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/LifeProTips" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/LifeProTips", - "program_html": [ - { - "url": "last", - "locator": 
"document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'", - "__GITLAB__/byteblaze/cloud-to-butt" - ] - } + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "subreddit": "LifeProTips"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/LifeProTips", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 117 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 685, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": { - "repo": "byteblaze/dotfiles", - "subreddit": "aww" - }, + "intent_template_id": 117, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Promote byteblaze/dotfiles to subreddit aww with the description from the repo itself.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/aww" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - 
"reference_url": "__REDDIT__/f/aww", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "Computer setup", - "__GITLAB__/byteblaze/dotfiles" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 117 + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": {"repo": "byteblaze/dotfiles", "subreddit": "aww"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__REDDIT__/f/aww", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 686, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": { - "repo": "auth0/angular-storage", - "subreddit": "technology" - }, + "intent_template_id": 117, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/technology" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], 
- "reference_answers": null, - "reference_url": "__REDDIT__/f/technology", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "A storage library for AngularJS done right", - "__GITLAB__/auth0/angular-storage" - ] - } + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": {"repo": "auth0/angular-storage", "subreddit": "technology"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/technology", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 117 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 687, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": { - "repo": "koush/AndroidAsync", - "subreddit": "funny" - }, + "intent_template_id": 117, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Promote koush/AndroidAsync to subreddit funny with the description from the repo itself.", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/funny" - } - ], - "site": "reddit" - }, - 
"original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/funny", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "Asynchronous socket, http(s) (client+server) and websocket library for android. Based on nio, not threads.", - "__GITLAB__/koush/AndroidAsync" - ] - } + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": {"repo": "koush/AndroidAsync", "subreddit": "funny"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/funny", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 117 + } + ], + "revision": 2 }, { - "sites": [ - "reddit", - "gitlab" - ], + "sites": ["reddit", "gitlab"], "task_id": 688, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": { - "repo": "lahwaacz/arch-wiki-docs", - "subreddit": "science" - }, + "intent_template_id": 117, + "start_urls": ["__REDDIT__", "__GITLAB__"], "intent": "Promote lahwaacz/arch-wiki-docs to subreddit science with the description from the repo itself.", - 
"require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__REDDIT__/f/science" - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__REDDIT__/f/science", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')", - "required_contents": { - "must_include": [ - "A script to download pages from Arch Wiki for offline browsing", - "__GITLAB__/lahwaacz/arch-wiki-docs" - ] - } + "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", + "instantiation_dict": {"repo": "lahwaacz/arch-wiki-docs", "subreddit": "science"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "reddit", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__REDDIT__/f/science", + "response_status": 200, + "event_type": "navigation" } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 117 + } + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 689, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", - "instantiation_dict": { - "reason": "I am 
a loyal customer" - }, - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a loyal customer\"", - "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a loyal customer", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "coupon", - "I am a loyal customer" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 163, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a loyal customer\"", + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "instantiation_dict": {"reason": "I am a loyal customer"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify exact text to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { 
+ "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 690, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", - "instantiation_dict": { - "reason": "they promised me a coupon last time" - }, - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", - "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as they promised me a coupon last time", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "coupon", - "promised" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 163, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason 
of \"{{reason}}\"", + "instantiation_dict": {"reason": "they promised me a coupon last time"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify exact text to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 691, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", - "instantiation_dict": { - "reason": "I plan to make a bulk purchase" - }, - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", - "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I plan to make a bulk purchase", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - 
"reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "coupon", - "bulk purchase" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 163, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "instantiation_dict": {"reason": "I plan to make a bulk purchase"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify exact text to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 692, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", - "instantiation_dict": { - 
"reason": "I am a student" - }, - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a student\"", - "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a student", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "coupon", - "student" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 163, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a student\"", + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "instantiation_dict": {"reason": "I am a student"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify exact text to enter" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", 
+ "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 693, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "original.intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}", - "instantiation_dict": { - "reason": "my refund is supposed to be replaced by a coupon" - }, - "original.instantiation_dict": { - "reason": "my refund is suppoed to be replaced by a coupon" - }, - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", - "original.intent": "Draft an email to the shop owner via their contact us function for a coupon as my refund is suppoed to be replaced by a coupon", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING__/contact", - "eval_func": { - "name": "verify_ui_value_exists", - "eval_params": { - "locator": "title=\"What\u2019s on your mind?\"" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING__/contact", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value", - "required_contents": { - "must_include": [ - "coupon", - "refund" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 163, - "changelogs": [ - { - "key": "intent_template", - "category": "permissive_string_match", - "note": "Clarify exact text to enter" - }, + "start_urls": ["__SHOPPING__"], + "intent": "Draft an email to the shop owner via their 
contact us function for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", + "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "instantiation_dict": {"reason": "my refund is supposed to be replaced by a coupon"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Fix spelling" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_ui_state", - "category": "context_free_evaluation", - "note": "Used locator check" + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "url_match_mode": "exact", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/contact", + "response_status": 200, + "event_type": "navigation" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 694, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 256, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60", "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", "instantiation_dict": { "product": "Energy-Bulk Women Shirt", @@ -41059,112 +19853,35 @@ "color": "blue", "price": "60" }, - "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": 
"verify_product_details", - "eval_params": { - "name": "Energy-Bulk Women Shirt" - }, - "expected_data": { - "name": "Energy-Bulk Women Shirt", - "price": 60.0, - "stock_qty": 50, - "color": "50", - "size": "167" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "60.00" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').value", - "required_contents": { - "must_include": [ - "Energy-Bulk Women Shirt" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "50" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", - "required_contents": { - "must_include": [ - "top" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[size]\"').value", - "required_contents": { - "exact_match": "167" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[color]\"').value", - "required_contents": { - "exact_match": "50" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", - "required_contents": { - "must_include": [ - "tops" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 256, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": 
"dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Energy-Bulk Women Shirt", + "site": "shopping_admin", + "expected": { + "name": "Energy-Bulk Women Shirt", + "price": 60.0, + "stock_qty": 50, + "color": "50", + "size": "167" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 695, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 256, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99", "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", "instantiation_dict": { "product": "Energy-Bulk Man Yoga Pant", @@ -41173,112 +19890,35 @@ "color": "yellow", "price": "69.99" }, - "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Energy-Bulk Man Yoga Pant" - }, - "expected_data": { - "name": "Energy-Bulk Man Yoga Pant", - "price": 69.99, - "stock_qty": 50, - "color": "60", - "size": "179" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "69.99" - } - }, - { - "url": "last", - "locator": 
"document.querySelector('[name=\"product[name]\"').value", - "required_contents": { - "must_include": [ - "Energy-Bulk Man Yoga Pant" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "50" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", - "required_contents": { - "must_include": [ - "bottom" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[size]\"').value", - "required_contents": { - "exact_match": "179" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[color]\"').value", - "required_contents": { - "exact_match": "60" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", - "required_contents": { - "must_include": [ - "bottoms" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 256, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Energy-Bulk Man Yoga Pant", + "site": "shopping_admin", + "expected": { + "name": "Energy-Bulk Man Yoga Pant", + "price": 69.99, + "stock_qty": 50, + "color": "60", + "size": "179" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 696, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 256, + "start_urls": 
["__SHOPPING_ADMIN__"], + "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99", "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", "instantiation_dict": { "product": "FancyBoy Man Causal Jeans", @@ -41287,112 +19927,35 @@ "color": "Blue", "price": "169.99" }, - "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "FancyBoy Man Causal Jeans" - }, - "expected_data": { - "name": "FancyBoy Man Causal Jeans", - "price": 169.99, - "stock_qty": 42, - "color": "50", - "size": "177" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').value", - "required_contents": { - "must_include": [ - "FancyBoy Man Causal Jeans" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "42" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "169.99" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", - "required_contents": { - "must_include": [ - "bottom" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[size]\"').value", - "required_contents": { - "exact_match": "177" - } - }, - { - "url": "last", - 
"locator": "document.querySelector('[name=\"product[color]\"').value", - "required_contents": { - "exact_match": "50" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", - "required_contents": { - "must_include": [ - "bottoms" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 256, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "FancyBoy Man Causal Jeans", + "site": "shopping_admin", + "expected": { + "name": "FancyBoy Man Causal Jeans", + "price": 169.99, + "stock_qty": 42, + "color": "50", + "size": "177" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 697, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 256, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99", "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", "instantiation_dict": { "product": "Swaatch Smart Watch", @@ -41401,105 +19964,35 @@ "color": "Blue", "price": "769.99" }, - "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - 
"name": "verify_product_details", - "eval_params": { - "name": "Swaatch Smart Watch" - }, - "expected_data": { - "name": "Swaatch Smart Watch", - "price": 769.99, - "stock_qty": "42", - "color": "50", - "size": "uni-size" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').value", - "required_contents": { - "must_include": [ - "Swaatch Smart Watch" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "42" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "769.99" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", - "required_contents": { - "must_include": [ - "gear" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[color]\"').value", - "required_contents": { - "exact_match": "50" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", - "required_contents": { - "must_include": [ - "watches" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 256, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Swaatch 
Smart Watch", + "site": "shopping_admin", + "expected": { + "name": "Swaatch Smart Watch", + "price": 769.99, + "stock_qty": "42", + "color": "50", + "size": "uni-size" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 698, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 256, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99", "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", "instantiation_dict": { "product": "Lelelumon Yoga Mat", @@ -41508,8030 +20001,3307 @@ "color": "black", "price": "769.99" }, - "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "name": "Lelelumon Yoga Mat" - }, - "expected_data": { - "name": "Lelelumon Yoga Mat", - "price": 769.99, - "stock_qty": 42, - "color": "49-black", - "size": "uni-size" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/catalog/product", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"product[name]\"').value", - "required_contents": { - "must_include": [ - "Lelelumon Yoga Mat" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "42" - } - }, - { - "url": "last", - "locator": 
"document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "769.99" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-role=\"selected-option\"').outerText", - "required_contents": { - "must_include": [ - "gear" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"product[color]\"').value", - "required_contents": { - "exact_match": "49" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-index=\"category_ids\"').outerText", - "required_contents": { - "must_include": [ - "fitness equipment" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 256, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "name": "Lelelumon Yoga Mat", + "site": "shopping_admin", + "expected": { + "name": "Lelelumon Yoga Mat", + "price": 769.99, + "stock_qty": 42, + "color": "49-black", + "size": "uni-size" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 699, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": { - "topic": "spring sale", - "rule": "a 20 percent discount site-wide" - }, - "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "verify_sale_details", - "eval_params": { - "topic": "spring sale" - }, - "expected_data": { - "name": "spring sale", - "website_ids": [ - 1 - ], - "customer_group_ids": [ - 1 - ], - "discount_amount_type": "by_percent", - "discount_amount": 20 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"[name='name'\").value", - "required_contents": { - "must_include": [ - "spring sale" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", - "required_contents": { - "exact_match": "0" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"simple_action\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "by_percent" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"discount_amount\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "20" - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 258, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers", + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": {"topic": "spring sale", 
"rule": "a 20 percent discount site-wide"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "topic": "spring sale", + "site": "shopping_admin", + "expected": { + "name": "spring sale", + "website_ids": [1], + "customer_group_ids": [1], + "discount_amount_type": "by_percent", + "discount_amount": 20 + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 700, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": { - "topic": "fall discount", - "rule": "$10 discount on checkout" - }, + "intent_template_id": 258, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Draft a new marketing price rule for fall discount that offers $10 discount on checkout for all customers", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_sale_details", - "eval_params": { - "topic": "fall discount" - }, - "expected_data": { - "name": "fall discount", - "website_ids": [ - 1 - ], - "customer_group_ids": [ - 1 - ], - "discount_amount_type": "cart_fixed", - "discount_amount": 10 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", - "program_html": [ - { - "url": "last", - "locator": 
"document.querySelector(\"[name='name'\").value", - "required_contents": { - "must_include": [ - "fall discount" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", - "required_contents": { - "exact_match": "0" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"simple_action\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "cart_fixed" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"discount_amount\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "10" - } + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": {"topic": "fall discount", "rule": "$10 discount on checkout"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "topic": "fall discount", + "site": "shopping_admin", + "expected": { + "name": "fall discount", + "website_ids": [1], + "customer_group_ids": [1], + "discount_amount_type": "cart_fixed", + "discount_amount": 10 } - ], - "url_note": "GOLD in PRED" - }, + } + ], + "revision": 2 + }, + { + "sites": ["shopping_admin"], + "task_id": 701, "intent_template_id": 258, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Draft a new marketing price rule for Mother's day sale that 
offers $15 discount on checkout for all customers", + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": {"topic": "Mother's day sale", "rule": "$15 discount on checkout"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "topic": "Mother's day sale", + "site": "shopping_admin", + "expected": { + "name": "Mother's day sale", + "website_ids": [1], + "customer_group_ids": [1], + "discount_amount_type": "cart_fixed", + "discount_amount": 15 + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], - "task_id": 701, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "sites": ["shopping_admin"], + "task_id": 702, + "intent_template_id": 258, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers", "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": { - "topic": "Mother's day sale", - "rule": "$15 discount on checkout" - }, - "intent": "Draft a new marketing price rule for Mother's day sale that offers $15 discount on checkout for all customers", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_sale_details", - "eval_params": { - "topic": "Mother's day sale" - }, - "expected_data": { - "name": "Mother's day sale", - "website_ids": [ - 1 - ], - "customer_group_ids": 
[ - 1 - ], - "discount_amount_type": "cart_fixed", - "discount_amount": 15 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"[name='name'\").value", - "required_contents": { - "must_include": [ - "Mother's day sale" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", - "required_contents": { - "exact_match": "0" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"simple_action\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "cart_fixed" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"discount_amount\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "15" - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 258, - "changelogs": [ + "instantiation_dict": {"topic": "Pride Month", "rule": "45% off on all products"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] - }, - { - "sites": [ - "shopping_admin" - ], - "task_id": 702, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Draft a new 
marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": { - "topic": "Pride Month", - "rule": "45% off on all products" - }, - "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_sale_details", - "eval_params": { - "topic": "Pride Month" - }, - "expected_data": { - "name": "Pride Month", - "website_ids": [ - 1 - ], - "customer_group_ids": [ - 1 - ], - "discount_amount_type": "by_percent", - "discount_amount": 45 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"[name='name'\").value", - "required_contents": { - "must_include": [ - "Pride Month" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex", - "required_contents": { - "exact_match": "0" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"simple_action\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "by_percent" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"discount_amount\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "45" - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 258, - "changelogs": [ + 
"evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "topic": "Pride Month", + "site": "shopping_admin", + "expected": { + "name": "Pride Month", + "website_ids": [1], + "customer_group_ids": [1], + "discount_amount_type": "by_percent", + "discount_amount": 45 + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 703, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": { - "topic": "Thanks giving sale", - "rule": "$40 discount on checkout" - }, - "intent": "Draft a new marketing price rule for Thanks giving sale that offers $40 discount on checkout for all customers", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_sale_details", - "eval_params": { - "topic": "Thanks giving sale" - }, - "expected_data": { - "name": "Thanks giving sale", - "website_ids": [ - 1 - ], - "customer_group_ids": [ - 1 - ], - "discount_amount_type": "cart_fixed", - "discount_amount": 40 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"[name='name'\").value", - "required_contents": { - "must_include": [ - "Thanks giving sale" - ] - } - }, - { - "url": "last", - "locator": 
"document.querySelector('[name=\"website_ids\"').selectedIndex", - "required_contents": { - "exact_match": "0" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"simple_action\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "cart_fixed" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"discount_amount\"').value", - "prep_actions": [ - "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()" - ], - "required_contents": { - "exact_match": "40" - } - } - ], - "url_note": "GOLD in PRED" - }, "intent_template_id": 258, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Draft a new marketing price rule for Thanks giving sale that offers $40 discount on checkout for all customers", + "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "instantiation_dict": {"topic": "Thanks giving sale", "rule": "$40 discount on checkout"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "topic": "Thanks giving sale", + "site": "shopping_admin", + "expected": { + "name": "Thanks giving sale", + "website_ids": [1], + "customer_group_ids": [1], + "discount_amount_type": "cart_fixed", + "discount_amount": 40 + } } - ] + ], + "revision": 2 
}, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 704, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": { - "report": "sales order report", - "time_span": "for last month" - }, + "intent_template_id": 268, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Today is 3/15/2023, generate a sales order report for last month", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "2/1/23", - "to_date": "2/28/23", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "2/1/23" + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": {"report": "sales order report", "time_span": "for last month"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + 
"query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "2/28/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 268 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 705, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": { - "report": "sales order report", - "time_span": "over the last 45 days" - }, + "intent_template_id": 268, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Today is 3/15/2023, generate a sales order report over the last 45 days", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "1/29/23", - "to_date": "3/15/23", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - 
"exact_match": "1/29/23" + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": {"report": "sales order report", "time_span": "over the last 45 days"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "3/15/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "01/29/2023", "to": "03/15/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 268 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 706, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": { - "report": "refund report", - "time_span": "for Q1" - }, + "intent_template_id": 268, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Today is 3/15/2023, generate a refund report for Q1", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/refunded/", - 
"eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "1/1/23", - "to_date": "3/31/23", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "1/1/23" + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": {"report": "refund report", "time_span": "for Q1"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "3/31/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/refunded/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "01/1/2023", "to": "03/31/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 268 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + 
"sites": ["shopping_admin"], "task_id": 707, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": { - "report": "sales order report", - "time_span": "for last year" - }, + "intent_template_id": 268, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Today is 3/15/2023, generate a sales order report for last year", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "1/1/2022", - "to_date": "12/31/2022", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "1/1/2022" + "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "instantiation_dict": {"report": "sales order report", "time_span": "for last year"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + 
"properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "12/31/2022" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "01/1/2022", "to": "12/31/2022"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 268 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 708, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": { - "report": "tax report", - "time_span": "for this year" - }, + "intent_template_id": 268, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Today is 3/15/2023, generate a tax report for this year", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/tax/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "1/1/2023", - "to_date": "12/31/2023", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/tax/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "1/1/2023" + "intent_template": "Today is 3/15/2023, generate a 
{{report}} {{time_span}}", + "instantiation_dict": {"report": "tax report", "time_span": "for this year"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "12/31/2023" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/tax/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "01/1/2023", "to": "12/31/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 268 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 709, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 271, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Create an orders report from beginning of May 2021 to end of March 2022", "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}", "instantiation_dict": { "type": "orders", "start_date": "beginning of May 2021", "end_date": "end of March 2022" }, - "intent": "Create an orders report from beginning of May 2021 to end of March 2022", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": 
"__SHOPPING_ADMIN__/admin/reports/report_sales/sales/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "5/1/2021", - "to_date": "3/31/2022", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "5/1/2021" + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "3/31/2022" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "05/1/2021", "to": "03/31/2022"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 271 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 710, - "require_login": true, - "storage_state": 
"./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": { - "type": "shipping", - "start_date": "08/05/2022", - "end_date": "03/01/2023" - }, + "intent_template_id": 271, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Create a shipping report from 08/05/2022 to 03/01/2023", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/shipping/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "8/5/22", - "to_date": "3/1/23", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/shipping", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "8/5/22" + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": {"type": "shipping", "start_date": "08/05/2022", "end_date": "03/01/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { 
"type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "3/1/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/shipping/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "08/5/2022", "to": "03/1/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 271 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 711, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": { - "type": "product view", - "start_date": "07/05/2021", - "end_date": "05/31/2023" - }, + "intent_template_id": 271, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Create a product view report from 07/05/2021 to 05/31/2023", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_product/viewed/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "7/5/21", - "to_date": "5/31/23", - "period_type": "day", - "show_empty_rows": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_product/viewed/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "7/5/21" + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": {"type": "product view", "start_date": "07/05/2021", "end_date": 
"05/31/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "5/31/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_product/viewed/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "07/5/2021", "to": "05/31/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 271 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 712, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": { - "type": "coupons", - "start_date": "05/01/2021", - "end_date": "05/15/2023" - }, + "intent_template_id": 271, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Create a coupons report from 05/01/2021 to 05/15/2023", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/coupons/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "5/1/21", - "to_date": "5/15/23", - "order_statuses": [], - 
"period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "5/1/21" + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": {"type": "coupons", "start_date": "05/01/2021", "end_date": "05/15/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "5/15/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "05/1/2021", "to": "05/15/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 271 + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 713, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - 
"start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": { - "type": "best sellers", - "start_date": "05/01/2022", - "end_date": "05/31/2023" - }, + "intent_template_id": 271, + "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Create a best sellers report from 05/01/2022 to 05/31/2023", - "require_reset": false, - "eval": { - "expected_ui_state": [ - { - "url": "__SHOPPING_ADMIN__/admin/reports/report_sales/bestsellers/", - "eval_func": { - "name": "verify_report_generation_form", - "eval_params": { - "from_date": "5/1/22", - "to_date": "5/31/23", - "order_statuses": [], - "period_type": "day", - "report_type": "created_at_order", - "show_empty_rows": "0", - "show_order_statuses": "0" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_from\"').value", - "required_contents": { - "exact_match": "5/1/22" + "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", + "instantiation_dict": {"type": "best sellers", "start_date": "05/01/2022", "end_date": "05/31/2023"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "url_match_mode": "exact", + "last_event_only": true, + "ignored_query_params_patterns": ["period_type", "^show"], + "decode_base64_query": true, + "query_string_schema": { + "type": "object", + "properties": { + "report_type": { "type": "string" }, + "from" : { "type": "string", 
"format": "date" }, + "to" : { "type": "string", "format": "date" } } }, - { - "url": "last", - "locator": "document.querySelector('[id=\"sales_report_to\"').value", - "required_contents": { - "exact_match": "5/31/23" - } + "expected": { + "url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/filter", + "event_type": "navigation", + "query_string": {"report_type": "created_at_order", "from": "05/1/2022", "to": "05/31/2023"} } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 271 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 714, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": { - "subreddit": "gadgets", - "k": "1" - }, + "intent_template_id": 24, + "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 1 post ever in gadgets.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_top_posts", - "expected_data": { - "subreddit": "gadgets", - "k": 1 - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, - "intent_template_id": 24, - "changelogs": [ + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": {"subreddit": "gadgets", "k": "1"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": 
"Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [19459] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 715, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": { - "subreddit": "history", - "k": "2" - }, - "intent": "Thumbs down the top 2 post ever in history.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_top_posts", - "expected_data": { - "subreddit": "history", - "k": 2 - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/history/84338/the-scientist-who-discovered-sperm-was-so-grossed-out-he", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/history/105990/4-500-year-old-sumerian-temple-dedicated-to-mighty-thunder", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 24, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Thumbs down the top 2 post ever in 
history.", + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": {"subreddit": "history", "k": "2"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [84338, 105990] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 716, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": { - "k": "3", - "subreddit": "books" - }, - "intent": "Thumbs down the top 3 post ever in books.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_top_posts", - "expected_data": { - "subreddit": "books", - "k": 3 - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists", - "locator": 
"document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/books/59447/appalachian-prison-book-project-seeks-notebook-donations-the", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 24, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Thumbs down the top 3 post ever in books.", + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": {"k": "3", "subreddit": "books"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [81371, 59421, 59447] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 717, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": { - "k": "4", - "subreddit": "movies" - }, - "intent": "Thumbs down the top 4 post ever in movies.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_top_posts", - "expected_data": { - "subreddit": 
"movies", - "k": 4 - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/movies/86174/who-will-win-the-oscar-for-actress-in-a-supporting-role", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/movies/86029/who-will-win-the-oscar-for-film-editing", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/movies/86055/cindy-williams-dies-laverne-amp-shirley-star-who-appeared-in", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/movies/42682/michelle-yeoh-to-receive-palm-springs-film-festival-s", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 24, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Thumbs down the top 4 post ever in movies.", + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": {"k": "4", "subreddit": "movies"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [86174, 86029, 86055, 42682] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 718, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": { - "k": "5", - "subreddit": "technology" - }, - "intent": "Thumbs down the top 5 post ever in technology.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_top_posts", - "expected_data": { - "subreddit": "technology", - "k": 5 - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/technology/48670/brain-cancer-vaccine-succeeds-at-prolonging-survival-in", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/technology/134696/india-cuts-internet-for-27-million-people-amid-search-for", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/technology/48785/us-judge-orders-amazon-to-cease-and-desist-anti-union", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - 
"url": "__REDDIT__/f/technology/70354/activision-s-boston-studio-workers-announce-unionization", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/technology/70233/social-media-influencers-are-charged-with-feeding-followers", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 24, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Thumbs down the top 5 post ever in technology.", + "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", + "instantiation_dict": {"k": "5", "subreddit": "technology"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [48670, 134696, 48785, 70354, 70233] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 719, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "subreddit": "wallstreetbets", - "user": "ThetaGang_wsb" - }, - "intent": "Like all submissions created by 
ThetaGang_wsb in subreddit wallstreetbets", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "wallstreetbets", - "target_user": "ThetaGang_wsb" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/wallstreetbets/29478/how-will-airbnb-close-following-their-earnings-report-on", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/wallstreetbets/29458/how-much-will-the-federal-reserve-raise-interest-rates-in", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, "intent_template_id": 25, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Like all submissions created by ThetaGang_wsb in subreddit wallstreetbets", + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"subreddit": "wallstreetbets", "user": "ThetaGang_wsb"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { 
"post_ids": [29478, 29458] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 720, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "subreddit": "earthporn", - "user": "CameronKelsey" - }, - "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "earthporn", - "target_user": "CameronKelsey" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, "intent_template_id": 25, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", + "intent_template": "Like all 
submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"subreddit": "earthporn", "user": "CameronKelsey"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [98332, 98297, 98256] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 721, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "UniversityofBath", - "subreddit": "IAmA" - }, - "intent": "Like all submissions created by UniversityofBath in subreddit IAmA", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "IAmA", - "target_user": "UniversityofBath" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/IAmA/119742/hi-i-m-vienne-a-doctoral-student-at-the-university-of-bath-i", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": 
"__REDDIT__/f/IAmA/119719/hello-reddit-i-m-nazia-mehrban-a-lecturer-in-biotechnology", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/119714/i-m-ellie-jarvis-she-her-a-2nd-year-phd-student-in-the", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/55155/hi-i-m-dr-lucy-maddox-from-bath-university-uk-i-m-a-clinical", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/55142/we-re-sadeka-nujhat-hannah-leese-and-sandhya-moise-from-the", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/34032/we-re-sandhya-moise-david-phillips-and-chan-lee-from-the", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/13175/hi-i-m-kit-yates-i-m-a-mathematical-biologist-at-the", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/IAmA/13170/hello-i-m-dr-sara-fontani-from-the-university-of", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote 
vote--user-upvoted" - ] - } - } - ] - }, "intent_template_id": 25, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Like all submissions created by UniversityofBath in subreddit IAmA", + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "UniversityofBath", "subreddit": "IAmA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [119742, 119719, 119714, 55155, 55142, 34032, 13175, 13170] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 722, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "Don_Gato1", - "subreddit": "nyc" - }, - "original.instantiation_dict": { - "user": "Don_Gato1", - "subreddit": "new york" - }, - "intent": "Like all submissions created by Don_Gato1 in subreddit nyc", - "original.intent": "Like all submissions created by Don_Gato1 in subreddit new york", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "nyc", - "target_user": "Don_Gato1" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], 
- "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/nyc/44650/fox-news-hosts-cast-new-york-as-crime-ridden-and-chaotic", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, "intent_template_id": 25, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "clarify_instructions", - "note": "Typo in subreddit name" - }, + "start_urls": ["__REDDIT__"], + "intent": "Like all submissions created by Don_Gato1 in subreddit nyc", + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "Don_Gato1", "subreddit": "nyc"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [44650] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 723, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "FTorrez81", - "subreddit": "iphone13" - }, + "intent_template_id": 25, + "start_urls": ["__REDDIT__"], "intent": "Like all submissions created by FTorrez81 in subreddit iphone13", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "FTorrez81", "subreddit": "iphone13"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "reference_answer_raw_annotation": "N/A", - "string_note": "FTorrez81 does not have any submissions in iphone13" - }, + } + ], + "revision": 2 + }, + { + "sites": ["reddit"], + "task_id": 724, "intent_template_id": 25, - "string_note": "FTorrez81 has no submissions in subreddit iphone13", - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Like all submissions created by Hrekires in subreddit news", + "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "Hrekires", "subreddit": "news"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "post_ids": [129816, 129808, 129794, 129783, 129594, 129508, 
43839, 43781, 43572, 43558] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" + "sites": ["reddit"], + "task_id": 725, + "intent_template_id": 1510, + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts", + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"subreddit": "massachusetts", "user": "RickyDontLoseThat"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [84954] } + } ], - "task_id": 724, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "Hrekires", - "subreddit": "news" - }, - "intent": "Like all submissions created by Hrekires in subreddit news", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "news", - "target_user": "Hrekires" - } - } + "revision": 2 + }, + { + "sites": ["reddit"], + "task_id": 726, + "intent_template_id": 1510, + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn", + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"subreddit": "earthporn", "user": "jacyanthis"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + 
"performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - 
] - } - }, - { - "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-upvoted" - ] - } - } - ] - }, - "intent_template_id": 25, - "changelogs": [ - { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" - }, - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] - }, - { - "sites": [ - "reddit" - ], - "task_id": 725, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "subreddit": "massachusetts", - "user": "RickyDontLoseThat" - }, - "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts", - "require_reset": false, - "eval": { 
- "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_upvoted_all_user_posts", - "expected_data": { - "subreddit": "massachusetts", - "target_user": "RickyDontLoseThat" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/massachusetts/84954/the-last-of-lincoln", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, - "intent_template_id": 1510, - "changelogs": [ - { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" - }, - { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" } - ] - }, - { - "sites": [ - "reddit" ], - "task_id": 726, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "subreddit": "earthporn", - "user": "jacyanthis" - }, - "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_all_user_posts", - "expected_data": { - "subreddit": "earthporn", - "target_user": "jacyanthis" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "jacyanthis does not have any submissions in earthporn", - "reference_answer_raw_annotation": "N/A" - }, - "intent_template_id": 1510, - 
"changelogs": [ - { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" - }, - { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" - } - ] + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 727, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "PatientBuilder499", - "subreddit": "videos" - }, - "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_all_user_posts", - "expected_data": { - "subreddit": "videos", - "target_user": "PatientBuilder499" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/videos/115139/hundreds-of-civilian-turkish-volunteers-waiting-to-be-sent", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 1510, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos", + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "PatientBuilder499", "subreddit": "videos"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system 
check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [115139] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 728, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "sirbarani", - "subreddit": "sports" - }, - "intent": "DisLike all submissions created by sirbarani in subreddit sports", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_all_user_posts", - "expected_data": { - "subreddit": "sports", - "target_user": "sirbarani" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/sports/48303/iran-football-legend-daei-will-not-attend-world-cup-amid", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 1510, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by sirbarani in subreddit sports", + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "sirbarani", "subreddit": "sports"}, + "format_specification": null, + "start_url_context": 
null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [48303] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 729, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "AdamCannon", - "subreddit": "UpliftingNews" - }, - "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_all_user_posts", - "expected_data": { - "subreddit": "UpliftingNews", - "target_user": "AdamCannon" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 1510, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", + "intent_template": "DisLike all submissions created by 
{{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "AdamCannon", "subreddit": "UpliftingNews"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { "post_ids": [16087] } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 730, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": { - "user": "Hrekires", - "subreddit": "news" - }, - "intent": "DisLike all submissions created by Hrekires in subreddit news", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_user_downvoted_all_user_posts", - "expected_data": { - "subreddit": "news", - "target_user": "Hrekires" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18", - 
"locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": 
"__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - }, - { - "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop", - "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')", - "required_contents": { - "must_include": [ - "vote vote--user-downvoted" - ] - } - } - ] - }, "intent_template_id": 1510, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by Hrekires in subreddit news", + "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", + "instantiation_dict": {"user": "Hrekires", "subreddit": "news"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "post_ids": [129816, 129808, 129794, 129783, 129594, 129508, 43839, 43781, 43572, 43558] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 731, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": { - "post": "Nvidia RTX 4090", - "content": "EDIT: This news aged 
well" - }, - "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "MachineLearning", - "post_id": 1, - "title": "EDIT: This news aged well" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/MachineLearning/1/nvidia-rtx-4090", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "must_include": [ - "EDIT: This news aged well", - "Crazy device for ML!" - ] - } - } - ] - }, "intent_template_id": 27, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": {"post": "Nvidia RTX 4090", "content": "EDIT: This news aged well"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "MachineLearning", + "post_id": 1, + "title": "EDIT: This news aged well" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 732, - 
"require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": { - "post": "The Night Agent", - "content": "Done watching, pretty cool!" - }, - "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "television", - "post_id": 134868, - "title": "Done watching, pretty cool!" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/television/134868/the-night-agent-renewed-for-season-2-at-netflix", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "exact_match": "Done watching, pretty cool!" 
- } - } - ] - }, "intent_template_id": 27, - "changelogs": [ + "start_urls": ["__REDDIT__"], + "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", + "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", + "instantiation_dict": {"post": "The Night Agent", "content": "Done watching, pretty cool!"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "television", + "post_id": 134868, + "title": "Done watching, pretty cool!" 
+ } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 733, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 27, + "start_urls": ["__REDDIT__"], + "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": { "post": "Star Trek Starfleet Academy series", "content": "Every watch makes me feel like a kid again" }, - "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "validate_post_exists_in_subreddit", - "expected_data": { - "subreddit": "television", - "post_id": 135201, - "title": "Every watch makes me feel like a kid again" - } - } - } - ], - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/television/135201/star-trek-starfleet-academy-series-from-alex-kurtzman-and", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "exact_match": "Every watch makes me feel like a kid again" - } - } - ] - }, - "intent_template_id": 27, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Update to use system check instead of dom check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": 
"expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "television", + "post_id": 135201, + "title": "Every watch makes me feel like a kid again" + } } - ] + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 734, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 27, + "start_urls": ["__REDDIT__"], + "intent": "Edit my post on Ted Lasso season 3 premiere by adding a line to the body that says \"Done watching. I love the renew!\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": { - "post": "Ted Lasso", + "post": "Ted Lasso season 3 premiere", "content": "Done watching. I love the renew!" }, - "intent": "Edit my post on Ted Lasso by adding a line to the body that says \"Done watching. I love the renew!\"", - "require_reset": false, - "eval": { - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/television/135156/ted-lasso-season-3-premiere-scores-870k-u-s-households-up-59", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "exact_match": "Done watching. I love the renew!" - } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "television", + "content": "Done watching. 
I love the renew!", + "title": "\u2018Ted Lasso\u2019 Season 3 Premiere Scores 870K U.S. Households, Up 59% From Season 2 Premiere" } - ] - }, - "intent_template_id": 27 + } + ], + "revision": 2 }, { - "sites": [ - "reddit" - ], + "sites": ["reddit"], "task_id": 735, - "require_login": true, - "storage_state": "./.auth/reddit_state.json", - "start_url": "__REDDIT__", - "geolocation": null, + "intent_template_id": 27, + "start_urls": ["__REDDIT__"], + "intent": "Edit my post on Lord of the Rings season 2 cast announcement by adding a line to the body that says \"The cast is amazing!\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": { - "post": "Lord of the Rings", + "post": "Lord of the Rings season 2 cast announcement", "content": "The cast is amazing!" }, - "intent": "Edit my post on Lord of the Rings by adding a line to the body that says \"The cast is amazing!\"", - "require_reset": false, - "eval": { - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__REDDIT__/f/television/135152/lord-of-the-rings-the-rings-of-power-season-2-cast-adds", - "locator": "document.querySelector('.submission__body').outerText", - "required_contents": { - "exact_match": "The cast is amazing!" 
- } + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "site": "reddit", + "expected": { + "subreddit": "television", + "content": "The cast is amazing!", + "title": "\u2018Lord of the Rings: The Rings of Power\u2019 Season 2 Cast Adds Ciar\u00e0n Hinds, Rory Kinnear and Tanya Moodie" } - ] - }, - "intent_template_id": 27 + } + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 736, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make the LICENSE of {{repo}} to MIT license.", + "intent_template_id": 355, + "start_urls": ["__GITLAB__"], + "intent": "Change the LICENSE for repo byteblaze/gimmiethat.space and byteblaze/dotfiles to MIT licenses", + "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": { - "repo": "gimmiethat.space and dotfiles" - }, - "intent": "Make the LICENSE of gimmiethat.space and dotfiles to MIT license.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "file_has_substring", - "eval_params": { - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "LICENSE", - "substring": "MIT License", - "branch": "main" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE", - "locator": "", - "required_contents": { - "must_include": [ - "MIT license", - "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." 
- ] - } - }, - { - "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE", - "locator": "", - "required_contents": { - "must_include": [ - "MIT license", - "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software." - ] - } - } - ] + "repo": "byteblaze/gimmiethat.space and byteblaze/dotfiles", + "license": "MIT licenses" }, - "intent_template_id": 355, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "gimmiethat.space", + "file_path": "LICENSE", + "substring": "MIT License", + "branch": "main", + "site": "gitlab", + "expected": true + }, + { + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "dotfiles", + "file_path": "LICENSE", + "substring": "MIT License", + "branch": "main", + "site": "gitlab", + "expected": true + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 737, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 94, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ", "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", 
"time": "" }, - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Wells Fargo Center", - "South Philadelphia Sports Complex" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 94 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 738, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 94, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th", "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", "time": 
"in the 70th" }, - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "3601 South Broad Street", - "South Philadelphia" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 94 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 739, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 94, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th", "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Yankees", "time": "in the 80th" }, 
- "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Yankee Stadium", - "East 161st Street" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 94 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 740, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 94, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team ", "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "NYC NBA team", "time": "" }, - "intent": "Show me the way from Carnegie Mellon 
University to the home stadium of NYC NBA team ", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Madison Square Garden", - "Pennsylvania Plaza", - "Manhattan", - "New York" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 94 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "wikipedia", - "map" - ], + "sites": ["wikipedia", "map"], "task_id": 741, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 94, + "start_urls": ["__MAP__", "__WIKIPEDIA__"], + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ", "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Boston home NBA team", "time": "" }, - "intent": "Show me the way from Carnegie Mellon 
University to the home stadium of Boston home NBA team ", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "TD Garden", - "Boston", - "Massachusetts" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 94 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 742, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", - "instantiation_dict": { - "scope": "private", - "account_list": "Abishek, Vinta" - }, - "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_visibility_and_members", - "eval_params": { - "group": "byteblaze", - "project": 
"planner", - "visibility": "private", - "members": [ - "abisubramanya27", - "vinta" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/planner", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/planner/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@abisubramanya27", - "@vinta" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", + "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", + "instantiation_dict": {"scope": "private", "account_list": "Abishek, Vinta"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "planner", + "visibility": "private", + "members": ["abisubramanya27", "vinta"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 743, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a new {{scope}} project \"web_arena\" and add 
{{account_list}} as members", - "instantiation_dict": { - "scope": "public", - "account_list": "Abishek, Vinta" - }, - "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_visibility_and_members", - "eval_params": { - "group": "byteblaze", - "project": "web_arena", - "visibility": "public", - "members": [ - "abisubramanya27", - "vinta" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/web_arena", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "public" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_arena/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@abisubramanya27", - "@vinta" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members", + "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members", + "instantiation_dict": {"scope": "public", "account_list": "Abishek, Vinta"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_arena", + 
"visibility": "public", + "members": ["abisubramanya27", "vinta"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 744, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", - "instantiation_dict": { - "scope": "public", - "account_list": "primer" - }, - "intent": "Create a new public project \"AutoAGI\" and add primer as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_visibility_and_members", - "eval_params": { - "group": "byteblaze", - "project": "AutoAGI", - "visibility": "public", - "members": [ - "primer" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/AutoAGI", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "public" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/AutoAGI/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@primer" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Create a new public project \"AutoAGI\" and add primer as members", + "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", + "instantiation_dict": {"scope": "public", "account_list": "primer"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "AutoAGI", + "visibility": "public", + "members": ["primer"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 745, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", - "instantiation_dict": { - "scope": "public", - "account_list": "primer, convexegg, abishek" - }, - "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_visibility_and_members", - "eval_params": { - "group": "byteblaze", - "project": "awesome-llms", - "visibility": "public", - "members": [ - "primer", - "convexegg", - "abisubramanya27" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/awesome-llms", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "public" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/awesome-llms/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@primer", - "@convexegg", - "@abisubramanya27" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as 
members", + "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", + "instantiation_dict": {"scope": "public", "account_list": "primer, convexegg, abishek"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "awesome-llms", + "visibility": "public", + "members": ["primer", "convexegg", "abisubramanya27"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 746, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", - "instantiation_dict": { - "scope": "private", - "account_list": "primer, convexegg, abishek" - }, - "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_visibility_and_members", - "eval_params": { - "group": "byteblaze", - "project": "llm_bulk_inference", - "visibility": "private", - "members": [ - "primer", - "convexegg", - "abisubramanya27" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__GITLAB__/byteblaze/llm_bulk_inference", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/llm_bulk_inference/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@primer", - "@convexegg", - "@abisubramanya27" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", + "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", + "instantiation_dict": {"scope": "private", "account_list": "primer, convexegg, abishek"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "llm_bulk_inference", + "visibility": "private", + "members": ["primer", "convexegg", "abisubramanya27"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 747, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 2100, + "start_urls": ["__GITLAB__"], + "intent": "Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members", "intent_template": "Start a private project {{project_name}} with 
{{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "awesome_web_agents", "template": "blank", "account_list": "Abishek, Vinta" }, - "intent": "Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_collaborators_and_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "awesome_web_agents", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initial commit" - ], - "require_all_substrings": true, - "collaborators": [ - "abisubramanya27", - "vinta" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/awesome_web_agents", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/awesome_web_agents/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initial commit" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@abisubramanya27", - "@vinta" - ] - } - } - ] - }, - "intent_template_id": 2100, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation of collaborators and message is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from 
dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "awesome_web_agents", + "visibility": "private", + "initial_commit_message_substrings": ["Initial commit"], + "require_all_substrings": true, + "collaborators": ["abisubramanya27", "vinta"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 748, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 2100, + "start_urls": ["__GITLAB__"], + "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "web_agent_android_xl", "template": "Android", "account_list": "primer, convexegg, abishek" }, - "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_collaborators_and_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "web_agent_android_xl", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initialized from 'Android' project template" - ], - "require_all_substrings": true, - "collaborators": [ - "primer", - "convexegg", - "abisubramanya27" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/web_agent_android_xl", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - 
"Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initialized from 'Android' project template" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@primer", - "@convexegg", - "@abisubramanya27" - ] - } - } - ] - }, - "intent_template_id": 2100, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_agent_android_xl", + "visibility": "private", + "initial_commit_message_substrings": ["Initialized from 'Android' project template"], + "require_all_substrings": true, + "collaborators": ["primer", "convexegg", "abisubramanya27"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 749, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 2100, + "start_urls": ["__GITLAB__"], + "intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members", "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "project_site", "template": "NodeJS", "account_list": "primer, convexegg, vinta" }, - 
"intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_collaborators_and_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "project_site", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initialized from 'NodeJS Express' project template" - ], - "require_all_substrings": true, - "collaborators": [ - "primer", - "convexegg", - "vinta" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/project_site", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/project_site/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initialized from 'NodeJS Express' project template" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/project_site/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@primer", - "@convexegg", - "@vinta" - ] - } - } - ] - }, - "intent_template_id": 2100, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "project_site", + 
"visibility": "private", + "initial_commit_message_substrings": ["Initialized from 'NodeJS Express' project template"], + "require_all_substrings": true, + "collaborators": ["primer", "convexegg", "vinta"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 750, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "agi_index", - "template": "HTML", - "account_list": "Vinta Chen" - }, - "intent": "Start a private project agi_index with HTML template and add Vinta Chen as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_fields_and_collaborators", - "eval_params": { - "group": "byteblaze", - "project": "agi_index", - "fields": { - "name": "agi_index", - "description": "plain HTML", - "visibility": "private" - }, - "collaborators": [ - "vinta" - ], - "description_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/agi_index", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/agi_index", - "locator": "document.querySelector('.home-panel-description-markdown').outerText", - "required_contents": { - "must_include": [ - "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
- ] - } - }, - { - "url": "__GITLAB__/byteblaze/agi_index/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "Vinta Chen" - ] - } - } - ] - }, "intent_template_id": 2100, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Start a private project agi_index with HTML template and add Vinta Chen as members", + "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "instantiation_dict": {"project_name": "agi_index", "template": "HTML", "account_list": "Vinta Chen"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "agi_index", + "fields": {"name": "agi_index", "description": "plain HTML", "visibility": "private"}, + "collaborators": ["vinta"], + "description_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 751, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 2100, + "start_urls": ["__GITLAB__"], + "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members", "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "AGISite", "template": "JEKYLL", "account_list": "Rohan 
and Vinta" }, - "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_fields_and_collaborators", - "eval_params": { - "group": "byteblaze", - "project": "AGISite", - "fields": { - "name": "AGISite", - "description": "Jekyll site", - "visibility": "private" - }, - "collaborators": [ - "Seirdy", - "vinta" - ], - "description_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/AGISite", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/AGISite", - "locator": "document.querySelector('.home-panel-description-markdown').outerText", - "required_contents": { - "must_include": [ - "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
- ] - } - }, - { - "url": "__GITLAB__/byteblaze/AGISite/-/project_members", - "locator": "", - "required_contents": { - "must_include": [ - "@Seirdy", - "@vinta" - ] - } - } - ] - }, - "intent_template_id": 2100, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "AGISite", + "fields": {"name": "AGISite", "description": "Jekyll site", "visibility": "private"}, + "collaborators": ["Seirdy", "vinta"], + "description_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 752, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": { - "project_name": "web_agent", - "template": "blank" - }, - "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "web_agent", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initial commit" - ], - "require_all_substrings": false - } - } - } - ], - "site": "gitlab" - 
}, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/web_agent", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initial commit" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ - { - "key": "intent_template_id", - "category": "reference_alignment", - "note": "Group similar tasks together" - }, + "start_urls": ["__GITLAB__"], + "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": {"project_name": "web_agent", "template": "blank"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation via project_with_visibility_has_initial_commit_message is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_agent", + "visibility": "private", + "initial_commit_message_substrings": ["Initial commit"], + "require_all_substrings": false, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 753, - "require_login": true, - "storage_state": 
"./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": { - "project_name": "web_agent_android_xs", - "template": "Android" - }, - "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "web_agent_android_xs", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initialized from 'Android' project template" - ], - "require_all_substrings": false - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/web_agent_android_xs", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent_android_xs/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initialized from 'Android' project template" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ - { - "key": "intent_template_id", - "category": "reference_alignment", - "note": "Group similar tasks together" - }, + "start_urls": ["__GITLAB__"], + "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": {"project_name": "web_agent_android_xs", "template": "Android"}, + 
"format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_agent_android_xs", + "visibility": "private", + "initial_commit_message_substrings": ["Initialized from 'Android' project template"], + "require_all_substrings": false, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 754, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": { - "project_name": "web_agent_nodejs", - "template": "NodeJS" - }, - "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_with_visibility_has_initial_commit_message", - "eval_params": { - "group": "byteblaze", - "project": "web_agent_nodejs", - "visibility": "private", - "initial_commit_message_substrings": [ - "Initialized from 'NodeJS Express' project template" - ], - "require_all_substrings": false - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": 
"__GITLAB__/byteblaze/web_agent_nodejs", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent_nodejs/-/commits", - "locator": "", - "required_contents": { - "must_include": [ - "Initialized from 'NodeJS Express' project template" - ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ - { - "key": "intent_template_id", - "category": "reference_alignment", - "note": "Group similar tasks together" - }, + "start_urls": ["__GITLAB__"], + "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": {"project_name": "web_agent_nodejs", "template": "NodeJS"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_agent_nodejs", + "visibility": "private", + "initial_commit_message_substrings": ["Initialized from 'NodeJS Express' project template"], + "require_all_substrings": false, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 755, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - 
"intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": { - "project_name": "web_agent_index", - "template": "HTML" - }, - "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_fields", - "eval_params": { - "group": "byteblaze", - "project": "web_agent_index", - "fields": { - "name": "web_agent_index", - "description": "plain HTML", - "visibility": "private" - }, - "description_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/web_agent_index", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/web_agent_index", - "locator": "document.querySelector('.home-panel-description-markdown').outerText", - "required_contents": { - "must_include": [ - "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
- ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ - { - "key": "intent_template_id", - "category": "reference_alignment", - "note": "Group similar tasks together" - }, + "start_urls": ["__GITLAB__"], + "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": {"project_name": "web_agent_index", "template": "HTML"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "web_agent_index", + "fields": { + "name": "web_agent_index", + "description": "plain HTML", + "visibility": "private" + }, + "description_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 756, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": { - "project_name": "11711_gitlab", - "template": "JEKYLL" - }, - "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", - "require_reset": false, - "eval": { - 
"expected_backend_state": [ - { - "eval_func": { - "name": "project_has_fields", - "eval_params": { - "group": "byteblaze", - "project": "11711_gitlab", - "fields": { - "name": "11711_gitlab", - "description": "Jekyll site", - "visibility": "private" - }, - "description_is_substring": true - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/byteblaze/11711_gitlab", - "locator": "document.querySelector('.visibility-icon').getAttribute('title')", - "required_contents": { - "must_include": [ - "Private" - ] - } - }, - { - "url": "__GITLAB__/byteblaze/11711_gitlab", - "locator": "document.querySelector('.home-panel-description-markdown').outerText", - "required_contents": { - "must_include": [ - "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features." 
- ] - } - } - ] - }, "intent_template_id": 332, - "changelogs": [ - { - "key": "intent_template_id", - "category": "reference_alignment", - "note": "Group similar tasks together" - }, + "start_urls": ["__GITLAB__"], + "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", + "instantiation_dict": {"project_name": "11711_gitlab", "template": "JEKYLL"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "API-based validation is more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "11711_gitlab", + "fields": {"name": "11711_gitlab", "description": "Jekyll site", "visibility": "private"}, + "description_is_substring": true, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 757, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 42, + "start_urls": ["__MAP__"], + "intent": "Show me the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions.", "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", "instantiation_dict": { "city1": "home of the 1980 Super Bowl champions", "city2": "home of the 1991 Super Bowl champions" }, - "intent": "Show me the path and travel time from home of the 
1980 Super Bowl champions to home of the 1991 Super Bowl champions.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "New York" - ] - } - } - ] - }, - "intent_template_id": 42 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 758, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", - "instantiation_dict": { - "city1": "the big apple", - "city2": "biggest city in Maine" - }, + "intent_template_id": 42, + "start_urls": ["__MAP__"], "intent": "Show me the path and travel time from the big apple to biggest city in Maine.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": 
"document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "New York" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Portland", - "Maine" - ] - } - } - ] - }, - "intent_template_id": 42 + "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "instantiation_dict": {"city1": "the big apple", "city2": "biggest city in Maine"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map", - "shopping_admin" - ], + "sites": ["map", "shopping_admin"], "task_id": 759, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 42, + "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], + "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City", "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", "instantiation_dict": { "city1": "the city where my E-commerce customer Sophia Young lives", "city2": "New York City" }, - "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City", - "require_reset": false, - "eval": { - "site": "shopping_admin" - }, - "original.eval": { - 
"eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Boston" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "New York" - ] - } - } - ] - }, - "intent_template_id": 42 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map", - "shopping_admin" - ], + "sites": ["map", "shopping_admin"], "task_id": 760, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 42, + "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], + "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives", "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", "instantiation_dict": { "city1": "Allentown, PA", "city2": "the city where my E-commerce customer Amanda Kim lives" }, - "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives", - "require_reset": false, - "eval": { - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - 
"reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Allentown" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Hoboken", - "New Jersey" - ] - } - } - ] - }, - "intent_template_id": 42 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 761, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "intent_template_id": 54, + "start_urls": ["__MAP__"], + "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk options.", + "intent_template": "Get directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options.", "instantiation_dict": { - "location/address_1": "Carnegie Science Museum", - "location/address_2": "Hunt library CMU", + "location_address_1": "Carnegie Science Museum", + "location_address_2": "Hunt library CMU", "transportation": "walk" }, - "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk 
options.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Science Center", - "Allegheny County", - "Pittsburgh" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Hunt Library", - "Pittsburgh" - ] - } - } - ] - }, - "intent_template_id": 54 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 762, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.", + "intent_template_id": 54, + "start_urls": ["__MAP__"], + "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", + "intent_template": "Get directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options.", "instantiation_dict": { - "location/address_1": "Carnegie Music Hall in NYC", - "location/address_2": "Carnegie Mellon 
University", + "location_address_1": "Carnegie Music Hall in NYC", + "location_address_2": "Carnegie Mellon University", "transportation": "driving" }, - "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "Carnegie Hall", - "West 57th Street", - "Manhattan", - "New York" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Carnegie Mellon University", - "Pittsburgh" - ] - } - } - ] - }, - "intent_template_id": 54 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 763, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", - "instantiation_dict": { - "store": "Trader Joe's", - "location": "401 Shady Ave, Pittsburgh" - }, + "intent_template_id": 75, + "start_urls": ["__MAP__"], "intent": "Find the walkway to the 
closest Trader Joe's from 401 Shady Ave, Pittsburgh.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "401, Shady Avenue, Shadyside" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Trader Joe's, 6343, Penn Avenue, East Liberty" - ] - } - } - ] - }, - "intent_template_id": 75 + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": {"store": "Trader Joe's", "location": "401 Shady Ave, Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 764, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", - "instantiation_dict": { - "store": "Target", - "location": "401 Shady Ave, Pittsburgh" - }, + "intent_template_id": 75, + "start_urls": ["__MAP__"], "intent": "Find the walkway to the closest Target from 401 Shady Ave, Pittsburgh.", - "require_reset": false, 
- "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "401, Shady Avenue, Shadyside" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Target, 6231, Penn Avenue, East Liberty" - ] - } - } - ] - }, - "intent_template_id": 75 + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": {"store": "Target", "location": "401 Shady Ave, Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 765, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", - "instantiation_dict": { - "store": "Japanese food market", - "location": "401 Shady Ave, Pittsburgh" - }, + "intent_template_id": 75, + "start_urls": ["__MAP__"], "intent": "Find the walkway to the closest Japanese food market from 401 Shady Ave, Pittsburgh.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - 
"eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "401, Shady Avenue, Shadyside" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Tokyo Japanese Food Store, 5855, Ellsworth Avenue, Shadyside" - ] - } - } - ] - }, - "intent_template_id": 75 + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": {"store": "Japanese food market", "location": "401 Shady Ave, Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 766, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", - "instantiation_dict": { - "store": "grocessory owned by Amazon", - "location": "401 Shady Ave, Pittsburgh" - }, + "intent_template_id": 75, + "start_urls": ["__MAP__"], "intent": "Find the walkway to the closest grocessory owned by Amazon from 401 Shady Ave, Pittsburgh.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - 
"program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "401, Shady Avenue, Shadyside" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Whole Foods Market, 5700, Penn Avenue, East Liberty" - ] - } - } - ] - }, - "intent_template_id": 75 + "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "instantiation_dict": {"store": "grocessory owned by Amazon", "location": "401 Shady Ave, Pittsburgh"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "map" - ], + "sites": ["map"], "task_id": 767, - "require_login": true, - "storage_state": null, - "start_url": "__MAP__", - "geolocation": null, + "intent_template_id": 75, + "start_urls": ["__MAP__"], + "intent": "Find the walkway to the closest chain grocessory owned by a local business from 401 Shady Ave, Pittsburgh.", "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", "instantiation_dict": { "store": "chain grocessory owned by a local business", "location": "401 Shady Ave, Pittsburgh" }, - "intent": "Find the walkway to the closest chain grocessory owned by a local business from 401 
Shady Ave, Pittsburgh.", - "require_reset": false, - "eval": { - "site": "map" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex", - "required_contents": { - "exact_match": "2" - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_from\"').value", - "required_contents": { - "must_include": [ - "401, Shady Avenue, Shadyside" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('[name=\"route_to\"').value", - "required_contents": { - "must_include": [ - "Giant Eagle, 5550, Centre Avenue, Shadyside" - ] - } - } - ] - }, - "intent_template_id": 75 + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "map", + "url_match_mode": "exact", + "last_event_only": true, + "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + } + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 768, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "{{quantity}} {{product}} arrived, update the stock", - "instantiation_dict": { - "quantity": "5", - "product": "blue Cronus yoga pants with size 33" - }, - "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "872" - }, - "expected_data": { - "stock_qty": 5, - "in_stock": true - } - 
} - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "5" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value", - "required_contents": { - "exact_match": "1" - } - } - ] - }, "intent_template_id": 241, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", + "intent_template": "{{quantity}} {{product}} arrived, update the stock", + "instantiation_dict": {"quantity": "5", "product": "blue Cronus yoga pants with size 33"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "872", + "site": "shopping_admin", + "expected": {"stock_qty": 5, "in_stock": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 769, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", - "instantiation_dict": { - "quantity": "378", - "product": "brown Aero daily fitness tee in every size" - }, - "intent": "We've received 378 brown 
Aero daily fitness tee in every size, please update the inventory.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "544" - }, - "expected_data": { - "stock_qty": 478 - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "547" - }, - "expected_data": { - "stock_qty": 478 - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "550" - }, - "expected_data": { - "stock_qty": 478 - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "553" - }, - "expected_data": { - "stock_qty": 478 - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "556" - }, - "expected_data": { - "stock_qty": 478 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/544/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "478" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/547/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "478" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/550/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "478" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/553/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "478" - } - }, - { - "url": 
"__SHOPPING_ADMIN__/catalog/product/edit/id/556/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "478" - } - } - ] - }, "intent_template_id": 241, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory.", + "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", + "instantiation_dict": {"quantity": "378", "product": "brown Aero daily fitness tee in every size"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "544", + "site": "shopping_admin", + "expected": {"stock_qty": 478} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "547", + "site": "shopping_admin", + "expected": {"stock_qty": 478} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "550", + "site": "shopping_admin", + "expected": {"stock_qty": 478} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "553", + "site": "shopping_admin", + "expected": {"stock_qty": 478} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "556", + "site": "shopping_admin", + "expected": {"stock_qty": 478} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 770, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "We've received {{quantity}}, update the inventory.", - 
"instantiation_dict": { - "quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29" - }, - "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1836" - }, - "expected_data": { - "stock_qty": 112 - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1838" - }, - "expected_data": { - "stock_qty": 156 - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1836/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "112" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/", - "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value", - "required_contents": { - "exact_match": "156" - } - } - ] - }, "intent_template_id": 241, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.", + "intent_template": "We've received {{quantity}}, update the inventory.", + "instantiation_dict": {"quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state 
check" + "evaluator": "BackendStateEvaluator", + "product_id": "1836", + "site": "shopping_admin", + "expected": {"stock_qty": 112} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "1838", + "site": "shopping_admin", + "expected": {"stock_qty": 156} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 771, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 243, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Approve the positive reviews to display in our store.", "intent_template": "Approve the positive reviews to display in our store.", "instantiation_dict": {}, - "intent": "Approve the positive reviews to display in our store.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "352" - }, - "expected_data": { - "approved": true - } - } - }, - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "349" - }, - "expected_data": { - "approved": true - } - } - }, - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "347" - }, - "expected_data": { - "approved": true - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/352", - "locator": "document.querySelector('[name=\"status_id\"').value", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", - "locator": "document.querySelector('[name=\"status_id\"').value", - "required_contents": { - "exact_match": "1" - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/347", - "locator": 
"document.querySelector('[name=\"status_id\"').value", - "required_contents": { - "exact_match": "1" - } - } - ] - }, - "intent_template_id": 243, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" - } - ] - }, - { - "sites": [ - "shopping_admin" - ], - "task_id": 772, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "pending negative reviews for Circe fleece" - }, - "intent": "Delete all pending negative reviews for Circe fleece", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "999" - }, - "expected_data": { - "exists": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/999", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - } - ] - }, - "intent_template_id": 246, - "changelogs": [ + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "352", + "site": "shopping_admin", + "expected": {"approved": true} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "349", + "site": "shopping_admin", + "expected": {"approved": true} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state 
check" + "evaluator": "BackendStateEvaluator", + "review_id": "347", + "site": "shopping_admin", + "expected": {"approved": true} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], - "task_id": 773, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "pending negative reviews" - }, - "intent": "Delete all pending negative reviews", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "351" - }, - "expected_data": { - "exists": false - } - } - }, - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "353" - }, - "expected_data": { - "exists": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/353", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - } - ] - }, + "sites": ["shopping_admin"], + "task_id": 772, "intent_template_id": 246, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all pending negative reviews for Circe fleece", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "pending negative reviews for Circe fleece"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": 
null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "review_id": "999", + "site": "shopping_admin", + "expected": {"exists": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" + "sites": ["shopping_admin"], + "task_id": 773, + "intent_template_id": 246, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all pending negative reviews", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "pending negative reviews"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "351", + "site": "shopping_admin", + "expected": {"exists": false} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "353", + "site": "shopping_admin", + "expected": {"exists": false} + } ], + "revision": 2 + }, + { + "sites": ["shopping_admin"], "task_id": 774, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "pending reviews with less than 4 stars" - }, - "intent": "Delete all pending reviews with less than 4 stars", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "351" - }, - "expected_data": { - "exists": false - } - } - }, - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "353" - }, - "expected_data": { - "exists": false - } - } - }, - { - "eval_func": { - "name": "verify_review_details", 
- "eval_params": { - "review_id": "349" - }, - "expected_data": { - "exists": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/351", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/353", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/349", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - } - ] - }, "intent_template_id": 246, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all pending reviews with less than 4 stars", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "pending reviews with less than 4 stars"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "351", + "site": "shopping_admin", + "expected": {"exists": false} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "353", + "site": "shopping_admin", + "expected": {"exists": false} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "349", + "site": "shopping_admin", + "expected": {"exists": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 775, - "require_login": true, - "storage_state": 
"./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "reviews from the scammer Arden" - }, - "intent": "Delete all reviews from the scammer Arden", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "51" - }, - "expected_data": { - "exists": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/51", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - } - ] - }, "intent_template_id": 246, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all reviews from the scammer Arden", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "reviews from the scammer Arden"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "51", + "site": "shopping_admin", + "expected": {"exists": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 776, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "reviews 
from the scammer Carlo" - }, - "intent": "Delete all reviews from the scammer Carlo", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "93" - }, - "expected_data": { - "exists": false - } - } - }, - { - "eval_func": { - "name": "verify_review_details", - "eval_params": { - "review_id": "109" - }, - "expected_data": { - "exists": false - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/93", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - }, - { - "url": "__SHOPPING_ADMIN__/review/product/edit/id/109", - "locator": "", - "required_contents": { - "must_include": [ - "Rating isn't Available" - ] - } - } - ] - }, "intent_template_id": 246, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all reviews from the scammer Carlo", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "reviews from the scammer Carlo"}, + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "review_id": "93", + "site": "shopping_admin", + "expected": {"exists": false} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "review_id": "109", + "site": "shopping_admin", + "expected": {"exists": false} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 777, 
- "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Reduce the price of green Hollister backyard sweatshirt in all sizes by $5", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "$5", - "action": "Reduce", + "performed_operation": "Reduce", "config": "green Hollister backyard sweatshirt in all sizes" }, - "original.instantiation_dict": { - "amount": "$5", - "action": "Reduce", - "config": "green Hollister backyard sweater in all size" - }, - "intent": "Reduce the price of green Hollister backyard sweatshirt in all sizes by $5", - "original.intent": "Reduce the price of green Hollister backyard sweater in all size by $5", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "120" - }, - "expected_data": { - "price": "47.00" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "117" - }, - "expected_data": { - "price": "47.00" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "114" - }, - "expected_data": { - "price": "47.00" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "111" - }, - "expected_data": { - "price": "47.00" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "123" - }, - "expected_data": { - "price": "47.00" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/120/", - "locator": 
"document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "47.00" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/117/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "47.00" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/114/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "47.00" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/111/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "47.00" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/123/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "47.00" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Use proper pluralization" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "instantiation_dict", - "category": "task_ambiguity", - "note": "No sweater product exists updating to correct product sweatshirt" + "evaluator": "BackendStateEvaluator", + "product_id": "120", + "site": "shopping_admin", + "expected": {"price": "47.00"} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "117", + "site": "shopping_admin", + "expected": {"price": "47.00"} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "114", + "site": "shopping_admin", + "expected": {"price": "47.00"} + }, + { + 
"evaluator": "BackendStateEvaluator", + "product_id": "111", + "site": "shopping_admin", + "expected": {"price": "47.00"} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "123", + "site": "shopping_admin", + "expected": {"price": "47.00"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 778, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "13.5%", - "action": "Reduce", + "performed_operation": "Reduce", "config": "size 28 Sahara leggings" }, - "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1841" - }, - "expected_data": { - "price": "64.88" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1852" - }, - "expected_data": { - "price": "64.88" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1843" - }, - "expected_data": { - "price": "64.88" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1841/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "64.88" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1842/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - 
"exact_match": "64.88" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1843/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "64.88" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "1841", + "site": "shopping_admin", + "expected": {"price": "64.88"} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "1852", + "site": "shopping_admin", + "expected": {"price": "64.88"} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1843", + "site": "shopping_admin", + "expected": {"price": "64.88"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 779, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all sizes below L by 15%", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "15%", - "action": "Reduce", + "performed_operation": "Reduce", "config": "yellow shirts from Gwyn Endurance in all sizes below L" }, - "original.instantiation_dict": { - "amount": "15%", - "action": "Reduce", - "config": "yellow shirts from Gwyn Endurance in all size below L" - }, - "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all sizes below L by 15%", - 
"original.intent": "Reduce the price of yellow shirts from Gwyn Endurance in all size below L by 15%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1559" - }, - "expected_data": { - "price": "20.40" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1562" - }, - "expected_data": { - "price": "20.40" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1565" - }, - "expected_data": { - "price": "20.40" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1559/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "20.40" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1562/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "20.40" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1565/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "20.40" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "1559", + "site": "shopping_admin", + "expected": {"price": "20.40"} + }, { - "key": "instantiation_dict", - "category": "spelling_or_grammar", - "note": "Use proper pluralization" + "evaluator": "BackendStateEvaluator", + 
"product_id": "1562", + "site": "shopping_admin", + "expected": {"price": "20.40"} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1565", + "site": "shopping_admin", + "expected": {"price": "20.40"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 780, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], + "intent": "Increase the price of white Ingrid Running with size L and above by $17", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "$17", - "action": "Increase", + "performed_operation": "Increase", "config": "white Ingrid Running with size L and above" }, - "intent": "Increase the price of white Ingrid Running with size L and above by $17", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1264" - }, - "expected_data": { - "price": "101.00" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1267" - }, - "expected_data": { - "price": "101.00" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1264/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "64.00" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1267/", - "locator": 
"document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "64.00" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "reference_alignment", - "note": "Original value was decrese of $20 when intent states increase of $17" + "evaluator": "BackendStateEvaluator", + "product_id": "1264", + "site": "shopping_admin", + "expected": {"price": "101.00"} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1267", + "site": "shopping_admin", + "expected": {"price": "101.00"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 781, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "37%", - "action": "Increase", + "performed_operation": "Increase", "config": "black fitness tshirts from Desiree with size XS" }, - "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "1573" - }, - "expected_data": { - "price": "32.88" - 
} - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1573/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "32.88" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "1573", + "site": "shopping_admin", + "expected": {"price": "32.88"} } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 782, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, + "intent_template_id": 742, + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "23%", - "action": "Increase", + "performed_operation": "Increase", "config": "all blue running tshirts in extra small and small sizes" }, - "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "496" - }, - "expected_data": { - "price": "35.67" - } - } - }, - { - "eval_func": { - "name": 
"verify_product_details", - "eval_params": { - "product_id": "499" - }, - "expected_data": { - "price": "35.67" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "479" - }, - "expected_data": { - "price": "34.44" - } - } - }, - { - "eval_func": { - "name": "verify_product_details", - "eval_params": { - "product_id": "482" - }, - "expected_data": { - "price": "34.44" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/496/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "22.33" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/499/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "22.33" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/479/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "21.56" - } - }, - { - "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/482/", - "locator": "document.querySelector('[name=\"product[price]\"').value", - "required_contents": { - "exact_match": "21.56" - } - } - ] - }, - "intent_template_id": 742, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "496", + "site": "shopping_admin", + "expected": {"price": "35.67"} + }, + { + "evaluator": "BackendStateEvaluator", + "product_id": "499", + "site": "shopping_admin", + "expected": {"price": "35.67"} + }, { - "key": 
"expected_backend_state", - "category": "reference_alignment", - "note": "Original values were decresed by 23% not increased as stated in the intent" + "evaluator": "BackendStateEvaluator", + "product_id": "479", + "site": "shopping_admin", + "expected": {"price": "34.44"} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "product_id": "482", + "site": "shopping_admin", + "expected": {"price": "34.44"} } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 783, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 351, + "start_urls": ["__GITLAB__"], + "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "kkroening/ffmpeg-python", "role": "maintainer", - "user_list": [ - "yjlou", - "a11yproject" - ] - }, - "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "not the owner of the repo so cannot add maintainer", - "reference_answer_raw_annotation": "N/A" + "user_list": ["yjlou", "a11yproject"] }, - "intent_template_id": 351, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": 
"unachievable_tasks", - "note": "byteblaze user does not have permissions to do this" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "PERMISSION_DENIED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 784, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", - "instantiation_dict": { - "branch_name": "master", - "attribute": "email address" - }, - "original.instantiation_dict": { - "branch_name": "main", - "attribute": "email address" - }, - "intent": "Tell me the email address of the contributor who has the most commits to branch master", - "original.intent": "Tell me the email address of the contributor who has the most commits to branch main", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "secupwn@users.noreply.github.com" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "secupwn@users.noreply.github.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "secupwn@users.noreply.github.com" - }, "intent_template_id": 316, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge score" - }, - { - "key": "instantiation_dict", - "category": "reference_alignment", - "note": "No such branch named main" - }, + "start_urls": 
["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], + "intent": "Tell me the email address of the contributor who has the most commits to branch master", + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": {"branch_name": "master", "attribute": "email address"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["secupwn@users.noreply.github.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 785, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", - "instantiation_dict": { - "branch_name": "gh-page", - "attribute": "email address" - }, - "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" - }, - "expected_data": [ - "secupwn@users.noreply.github.com" - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "secupwn@users.noreply.github.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": 
"secupwn@users.noreply.github.com" - }, "intent_template_id": 316, - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], + "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page", + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": {"branch_name": "gh-page", "attribute": "email address"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "string"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": ["secupwn@users.noreply.github.com"] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 786, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/vinta/awesome-python", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", - "instantiation_dict": { - "branch_name": "master", - "attribute": "number of commits" - }, - "original.instantiation_dict": { - "branch_name": "main", - "attribute": "number of commits" - }, - "intent": "Tell me the number of commits of the contributor who has the most commits to branch master", - "original.intent": "Tell me the number of commits of the contributor who has the most commits to branch main", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": 
"verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 412 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "412" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "412" - }, "intent_template_id": 316, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "reference_alignment", - "note": "No such branch named main; original answer incorrect" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/vinta/awesome-python"], + "intent": "Tell me the number of commits of the contributor who has the most commits to branch master", + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": {"branch_name": "master", "attribute": "number of commits"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [414] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 787, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/dehenne/awesome-visibility/-/graphs/master", - "geolocation": null, - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", - "instantiation_dict": { - "branch_name": 
"master", - "attribute": "number of followers" - }, - "original.instantiation_dict": { - "branch_name": "main", - "attribute": "number of followers" - }, - "intent": "Tell me the number of followers of the contributor who has the most commits to branch master", - "original.intent": "Tell me the number of followers of the contributor who has the most commits to branch main", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "numeric" - }, - "expected_data": [ - 0 - ] - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "0" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "0" - }, "intent_template_id": 316, - "changelogs": [ - { - "key": "instantiation_dict", - "category": "reference_alignment", - "note": "No such branch named main; original answer incorrect" - }, - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, + "start_urls": ["__GITLAB__/dehenne/awesome-visibility/-/graphs/master"], + "intent": "Tell me the number of followers of the contributor who has the most commits to branch master", + "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "instantiation_dict": {"branch_name": "master", "attribute": "number of followers"}, + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { "type": "array", "items": {"type": "number"} }, + "expected": { + "performed_operation": 
"retrieve", + "status": "SUCCESS", + "retrieved_data": [0] + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 788, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed", - "geolocation": null, + "intent_template_id": 316, + "start_urls": ["__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed"], + "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52", "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", "instantiation_dict": { "branch_name": "php52", "attribute": "full name, gitlab account name, location and email address" }, - "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "verify_retrieved_value_normalized", - "eval_params": { - "ordered": "false", - "type": "text" + "format_specification": "Use an object with keys: \"first_name\", \"last_name\", \"gitlab_account_name\", \"location\" and \"email_address\"", + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "ordered": "false", + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "first_name" : { "type": "string" }, + "last_name" : { "type": "string" }, + "gitlab_account_name": { "type": "string" }, + "location" : { "type": "string" }, + "email_address" : { "type": "string" } }, - "expected_data": [ - "Andrew M. 
Whalen", - "amwhalen", - "Massachusetts", - "github@amwhalen.com" - ] + "required": ["email_address", "first_name", "gitlab_account_name", "last_name", "location"] } + }, + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { + "first_name": "Andrew", + "last_name": "Whalen", + "gitlab_account_name": "amwhalen", + "location": "Massachusetts", + "email_address": "github@amwhalen.com" + } + ] } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "must_include": [ - "Andrew M. Whalen", - "amwhalen", - "Massachusetts", - "github@amwhalen.com" - ] - }, - "reference_url": "", - "program_html": [], - "string_note": "", - "reference_answer_raw_annotation": "Andrew M. Whalen, amwhalen, Massachusetts, github@amwhalen.com" - }, - "intent_template_id": 316, - "format_specification": "Return value is a list of format [full name, gitlab account name, location, email address]", - "changelogs": [ - { - "key": "expected_retrieve_value", - "category": "avoid_permissive_text", - "note": "Use structured response to judge correctness" - }, - { - "key": "eval", - "category": "permissive_string_match", - "note": "Switched to exact match" } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 789, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Create an issue {{issue}} in {{repo}}.", + "intent_template_id": 328, + "start_urls": ["__GITLAB__"], + "intent": "Open an issue with title \"WebAgent Support Plan\" to ask the team if they have any plan on supporting Webagent in the next quater in the respository huggingface dataset.", + "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { - "repo": "huggingface dataset", - "issue": "asking about do they have any plan on 
supporting Webagent in the next quater" - }, - "intent": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "RESOURCE_NOT_FOUND_ERROR" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "huggingface repo does not exist", - "reference_answer_raw_annotation": "N/A" + "title": "WebAgent Support Plan", + "issue": "ask the team if they have any plan on supporting Webagent in the next quater", + "repo": "huggingface dataset" }, - "intent_template_id": 328, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Expect the right error code for unachievable tasks" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping_admin" - ], + "sites": ["shopping_admin"], "task_id": 790, - "require_login": true, - "storage_state": "./.auth/shopping_admin_state.json", - "start_url": "__SHOPPING_ADMIN__", - "geolocation": null, - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": { - "review_type": "negative reviews for Sybil running short" - }, - "intent": "Delete all negative reviews for Sybil running short", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping_admin" - }, - "original.eval": { - 
"eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "No negative reviews for sybil running short", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 246, - "changelogs": [ + "start_urls": ["__SHOPPING_ADMIN__"], + "intent": "Delete all negative reviews for Sybil running short", + "intent_template": "Delete all {{review_type}}", + "instantiation_dict": {"review_type": "negative reviews for Sybil running short"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab", - "reddit" - ], + "sites": ["gitlab", "reddit"], "task_id": 791, - "require_login": true, - "storage_state": "./.auth/gitlab.reddit_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Make a folder named {{directory}} on the {{gitlab_repo}} repo and include a file called urls.txt that consists of the links to the 5 most recent posts from {{subreddit}}.", - "instantiation_dict": { - "directory": "car", - "subreddit": "cars", - "gitlab_repo": "gimmiethat.space" - }, - "intent": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.", - "require_reset": false, - "eval": { - "site": "reddit" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "the subreddit does not exist", - "reference_answer_raw_annotation": "N/A" - }, 
"intent_template_id": 84, - "changelogs": [ + "start_urls": ["__GITLAB__", "__REDDIT__"], + "intent": "Create a folder named car in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the cars subreddit.", + "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", + "instantiation_dict": {"directory": "car", "subreddit": "cars", "gitlab_repo": "gimmiethat.space"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "NOT_FOUND_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 792, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", - "instantiation_dict": { - "product_category": "Beauty & Personal Care", - "dollar_value": "under 20" - }, - "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The website does not support sorting 
by rating and there are too many products in this category", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 172, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "instantiation_dict": {"product_category": "Beauty & Personal Care", "dollar_value": "under 20"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 793, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, + "intent_template_id": 172, + "start_urls": ["__SHOPPING__"], + "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 129.99.", "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", "instantiation_dict": { "product_category": "Men clothing", "dollar_value": "above 50 but under 129.99" }, - "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 129.99.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - 
"reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The website does not support sorting by rating and there are too many products in this category", - "reference_answer_raw_annotation": "N/A" - }, - "intent_template_id": 172, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 794, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": { - "address": "4000 Forbes Ave, Pittsburgh, PA" - }, - "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The shipping address cannot be changed after an order is placed in the website.", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 191, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.", + "intent_template": "Change the delivery address for my most 
recent order to {{address}}.", + "instantiation_dict": {"address": "4000 Forbes Ave, Pittsburgh, PA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 795, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": { - "address": "6726 McPherson Blvd, Pittsburgh, PA" - }, - "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The shipping address cannot be changed after an order is placed in the website.", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 191, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": {"address": "6726 McPherson Blvd, Pittsburgh, PA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": 
"expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": ["shopping"], "task_id": 796, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": { - "address": "155 5th Street, San Francisco, CA" - }, - "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The shipping address cannot be changed after an order is placed in the website.", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 191, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.", + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": {"address": "155 5th Street, San Francisco, CA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + 
"expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] - }, - { - "sites": [ - "shopping" ], - "task_id": 797, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": { - "address": "3 Oxford St, Cambridge, MA" - }, - "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The shipping address cannot be changed after an order is placed in the website.", - "reference_answer_raw_annotation": "N/A" - }, + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 797, "intent_template_id": 191, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.", + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": {"address": "3 Oxford St, Cambridge, MA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "shopping" - ], + "sites": 
["shopping"], "task_id": 798, - "require_login": true, - "storage_state": "./.auth/shopping_state.json", - "start_url": "__SHOPPING__", - "geolocation": null, - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": { - "address": "77 Massachusetts Ave, Cambridge, MA" - }, - "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", - "require_reset": false, - "eval": { - "expected_retrieve_value": [ - { - "eval_func": { - "name": "validate_status", - "expected_data": { - "expected_status": "ACTION_NOT_ALLOWED_ERROR" - } - } - } - ], - "site": "shopping" - }, - "original.eval": { - "eval_types": [ - "string_match" - ], - "reference_answers": { - "fuzzy_match": "N/A" - }, - "reference_url": "", - "program_html": [], - "string_note": "The shipping address cannot be changed after an order is placed in the website.", - "reference_answer_raw_annotation": "N/A" - }, "intent_template_id": 191, - "changelogs": [ + "start_urls": ["__SHOPPING__"], + "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", + "intent_template": "Change the delivery address for my most recent order to {{address}}.", + "instantiation_dict": {"address": "77 Massachusetts Ave, Cambridge, MA"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_retrieve_value", - "category": "unachievable_tasks", - "note": "Status check used instead of fuzzy match" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": { + "performed_operation": "retrieve", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 799, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a new 
group \"{{name}}\" with members {{members}}", - "instantiation_dict": { - "name": "n-lab", - "members": "patou, egpast, westurner, jontutcher" - }, - "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_group_exist_with_members", - "eval_params": { - "group_name": "n-lab", - "member_usernames": [ - "patou", - "egpast", - "westurner", - "jontutcher" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/groups/n-lab/-/group_members", - "locator": "", - "required_contents": { - "must_include": [ - "@patou", - "@egpast", - "@westurner", - "@jontutcher" - ] - } - } - ] - }, "intent_template_id": 600, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": {"name": "n-lab", "members": "patou, egpast, westurner, jontutcher"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify group and members; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group_name": "n-lab", + "member_usernames": ["patou", "egpast", "westurner", "jontutcher"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": 
["gitlab"], "task_id": 800, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 600, + "start_urls": ["__GITLAB__"], + "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY", "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": { "name": "x-lab", "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" }, - "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_group_exist_with_members", - "eval_params": { - "group_name": "x-lab", - "member_usernames": [ - "JonasVautherin", - "dilipchandima", - "dawiss1337", - "bmyun", - "DCMJY" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/groups/x-lab/-/group_members", - "locator": "", - "required_contents": { - "must_include": [ - "@JonasVautherin", - "@dilipchandima", - "@dawiss1337", - "@bmyun", - "@DCMJY" - ] - } - } - ] - }, - "intent_template_id": 600, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify group and members; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group_name": "x-lab", + "member_usernames": ["JonasVautherin", 
"dilipchandima", "dawiss1337", "bmyun", "DCMJY"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 801, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": { - "name": "crew", - "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix" - }, - "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_group_exist_with_members", - "eval_params": { - "group_name": "crew", - "member_usernames": [ - "ASWATFZLLC", - "patrickhlauke", - "westurner", - "linkmatrix" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/groups/crew/-/group_members", - "locator": "", - "required_contents": { - "must_include": [ - "@ASWATFZLLC", - "@patrickhlauke", - "@westurner", - "@linkmatrix" - ] - } - } - ] - }, "intent_template_id": 600, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix", + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": {"name": "crew", "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify group and members; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group_name": "crew", + "member_usernames": ["ASWATFZLLC", "patrickhlauke", "westurner", "linkmatrix"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 802, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": { - "name": "coding_friends", - "members": "qhduan, Agnes-U" - }, - "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_group_exist_with_members", - "eval_params": { - "group_name": "coding_friends", - "member_usernames": [ - "qhduan", - "Agnes-U" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/groups/coding_friends/-/group_members", - "locator": "", - "required_contents": { - "must_include": [ - "@qhduan", - "@Agnes-U" - ] - } - } - ] - }, "intent_template_id": 600, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": {"name": "coding_friends", "members": "qhduan, Agnes-U"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify group and members; more accurate" + "evaluator": "AgentResponseEvaluator", + 
"results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group_name": "coding_friends", + "member_usernames": ["qhduan", "Agnes-U"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 803, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": { - "name": "webagent", - "members": "pandey2000, sayakpaul, sayakpaul" - }, - "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "does_group_exist_with_members", - "eval_params": { - "group_name": "webagent", - "member_usernames": [ - "pandey2000", - "sayakpaul" - ] - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/groups/webagent/-/group_members", - "locator": "", - "required_contents": { - "must_include": [ - "@pandey2000", - "@sayakpaul" - ] - } - } - ] - }, "intent_template_id": 600, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", + "intent_template": "create a new group \"{{name}}\" with members {{members}}", + "instantiation_dict": {"name": "webagent", "members": "pandey2000, sayakpaul, sayakpaul"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": 
"switch_to_api_eval", - "note": "Use API to verify group and members; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group_name": "webagent", + "member_usernames": ["pandey2000", "sayakpaul"], + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 804, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", - "instantiation_dict": { - "issue": "flash alerts", - "account": "myself and primer" - }, - "intent": "Assign the issue regarding flash alerts to myself and primer.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "issue_assignee_is", - "eval_params": { - "usernames": [ - "byteblaze", - "primer" - ], - "group": "primer", - "project": "design", - "issue_iid": 104 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", - "locator": "", - "required_contents": { - "must_include": [ - "Add documentation on using Flash alerts in dialog components", - "Clarify usage of flash alert" - ] - } - }, - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=primer", - "locator": "", - "required_contents": { - "must_include": [ - "Add documentation on using Flash alerts in dialog components", - "Clarify usage of flash alert" - ] - } - } - ] - 
}, "intent_template_id": 999, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Assign the issue regarding flash alerts to myself and primer.", + "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", + "instantiation_dict": {"issue": "flash alerts", "account": "myself and primer"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "usernames": ["byteblaze", "primer"], + "group": "primer", + "project": "design", + "issue_iid": 104, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 805, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/a11yproject/a11yproject.com", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], + "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan as the reviewer", "intent_template": "Submit a merge request for {{source_branch}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "the branch that implements the support of template strings", "target_branch": "master", "reviewer": "myself and Roshan" }, - "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan 
as the reviewer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_merge_request_has_values", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "source_branch": "feature/replace-gulp", - "target_branch": "main", - "reviewer": "byteblaze" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "feature/replace-gulp" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - "required_contents": { - "exact_match": "main" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - "must_include": [ - "Byte Blaze", - "Roshan Jossy" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "source_branch": "feature/replace-gulp", + "target_branch": "main", + "reviewer": "byteblaze", + "site": 
"gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 806, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__"], + "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer", "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "a11yproject.com/redesign", "target_branch": "markdown-figure-block", "reviewer": "myself" }, - "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_merge_request_has_values", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "source_branch": "redesign", - "target_branch": "feature/markdown-figure-block", - "reviewer": "byteblaze" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "redesign" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - "required_contents": { - "exact_match": "feature/markdown-figure-block" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - 
"must_include": [ - "Byte Blaze" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "source_branch": "redesign", + "target_branch": "feature/markdown-figure-block", + "reviewer": "byteblaze", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 807, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__/primer/design", - "geolocation": null, + "intent_template_id": 335, + "start_urls": ["__GITLAB__/primer/design"], + "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer", "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "build time debug", "target_branch": "main", "reviewer": "myself" }, - "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "check_merge_request_has_values", - "eval_params": { - "group": "primer", - "project": "design", - "source_branch": "debug-build-time", - "target_branch": "main", - "reviewer": 
"byteblaze" - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/primer/design/-/merge_requests", - "program_html": [ - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText", - "required_contents": { - "exact_match": "main" - } - }, - { - "url": "last", - "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText", - "required_contents": { - "exact_match": "debug-build-time" - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.reviewer').outerText", - "required_contents": { - "must_include": [ - "Byte Blaze" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 335, - "changelogs": [ + "format_specification": null, + "start_url_context": "Complete the task given the context of the current page.", + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "primer", + "project": "design", + "source_branch": "debug-build-time", + "target_branch": "main", + "reviewer": "byteblaze", + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 808, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 327, + "start_urls": ["__GITLAB__"], + "intent": "Create an issue in cloud-to-butt repo with 
title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", "instantiation_dict": { "repo": "cloud-to-butt", @@ -49539,86 +23309,35 @@ "account": "myself", "due": "the end of Q1 2033" }, - "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "byteblaze", - "project": "cloud-to-butt", - "fields": { - "title": "Let's keep the project alive", - "assignee.username": "byteblaze", - "due_date": "2033-03-31" - } - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", - "required_contents": { - "exact_match": "Let's keep the project alive" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", - "required_contents": { - "must_include": [ - "Mar 31, 2033" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.assignee').outerText", - "required_contents": { - "must_include": [ - "Byte Blaze" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 327, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "byteblaze", + "project": "cloud-to-butt", + "fields": { + "title": "Let's keep the project alive", + "assignee.username": "byteblaze", + "due_date": "2033-03-31" + }, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 809, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, + "intent_template_id": 327, + "start_urls": ["__GITLAB__"], + "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", "instantiation_dict": { "repo": "a11yproject", @@ -49626,210 +23345,83 @@ "account": "myself", "due": "2030-1-3" }, - "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. 
Set due date to be 2030-1-3", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "project_has_issue_with_fields", - "eval_params": { - "group": "a11yproject", - "project": "a11yproject.com", - "fields": { - "title": "404 for many URLs", - "assignee.username": "byteblaze", - "due_date": "2030-01-03" - } - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "url_match", - "program_html" - ], - "reference_answers": null, - "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", - "program_html": [ - { - "url": "last", - "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText", - "required_contents": { - "exact_match": "404 for many URLs" - } - }, - { - "url": "last", - "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText", - "required_contents": { - "must_include": [ - "Jan 3, 2030" - ] - } - }, - { - "url": "last", - "locator": "document.querySelector('.block.assignee').outerText", - "required_contents": { - "must_include": [ - "Byte Blaze" - ] - } - } - ], - "url_note": "GOLD in PRED" - }, - "intent_template_id": 327, - "changelogs": [ + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "group": "a11yproject", + "project": "a11yproject.com", + "fields": { + "title": "404 for many URLs", + "assignee.username": "byteblaze", + "due_date": "2030-01-03" + }, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - 
"sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 810, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", - "instantiation_dict": { - "repo": "primer design guide", - "issue": "flash alert", - "account": "myself" - }, - "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "issue_assignee_is", - "eval_params": { - "usernames": [ - "byteblaze" - ], - "group": "primer", - "project": "design", - "issue_iid": 104 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", - "locator": "", - "required_contents": { - "must_include": [ - "Add documentation on using Flash alerts in dialog components", - "Clarify usage of flash alert" - ] - } - } - ] - }, "intent_template_id": 999, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.", + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", + "instantiation_dict": {"repo": "primer design guide", "issue": "flash alert", "account": "myself"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": 
"Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "usernames": ["byteblaze"], + "group": "primer", + "project": "design", + "issue_iid": 104, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 }, { - "sites": [ - "gitlab" - ], + "sites": ["gitlab"], "task_id": 811, - "require_login": true, - "storage_state": "./.auth/gitlab_state.json", - "start_url": "__GITLAB__", - "geolocation": null, - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", - "instantiation_dict": { - "repo": "a11yproject", - "issue": 404, - "account": "myself" - }, - "intent": "Assign the issue regarding 404 in a11yproject to myself.", - "require_reset": false, - "eval": { - "expected_backend_state": [ - { - "eval_func": { - "name": "issue_assignee_is", - "eval_params": { - "usernames": [ - "byteblaze" - ], - "group": "a11yproject", - "project": "a11yproject.com", - "issue_iid": 1478 - } - } - } - ], - "site": "gitlab" - }, - "original.eval": { - "eval_types": [ - "program_html" - ], - "reference_answers": null, - "reference_url": "", - "program_html": [ - { - "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze", - "locator": "", - "required_contents": { - "must_include": [ - "404s, bad host, timeouts, bad urls for URLs linked from website" - ] - } - } - ] - }, "intent_template_id": 999, - "changelogs": [ + "start_urls": ["__GITLAB__"], + "intent": "Assign the issue regarding 404 in a11yproject to myself.", + "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "myself"}, + "format_specification": null, + "start_url_context": null, + "eval": [ { - "key": "expected_backend_state", - "category": "switch_to_api_eval", - "note": "Use API to verify; more accurate" + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "key": "expected_backend_state", - "category": "dom_dependent_eval", - "note": "Switched from dom dependent evaluation to backend state check" + "evaluator": "BackendStateEvaluator", + "usernames": ["byteblaze"], + "group": "a11yproject", + "project": "a11yproject.com", + "issue_iid": 1478, + "site": "gitlab", + "expected": true } - ] + ], + "revision": 2 } -] \ No newline at end of file +] From e59f75479d84f3c54bd3aede842d6fc75cd68376 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 22 Oct 2025 18:09:30 +0000 Subject: [PATCH 17/64] update task name template to webarena_verified.templateID.taskID --- .../benchmark/metadata/webarena_verified.csv | 1624 ++++++++--------- .../browsergym/webarena_verified/__init__.py | 4 +- .../browsergym/webarena_verified/config.py | 11 + 3 files changed, 825 insertions(+), 814 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv index 2b70a143..94dae972 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv @@ -1,813 +1,813 @@ task_name,requires_reset,sites,eval_types,task_id,browsergym_split,depends_on -webarena_verified.0,False,shopping_admin,retrieve_value,0,train, -webarena_verified.1,False,shopping_admin,retrieve_value,1,test,webarena_verified.0 -webarena_verified.2,False,shopping_admin,retrieve_value,2,train,webarena_verified.1 -webarena_verified.3,False,shopping_admin,retrieve_value,3,test,webarena_verified.2 -webarena_verified.4,False,shopping_admin,retrieve_value,4,train,webarena_verified.3 -webarena_verified.5,False,shopping_admin,retrieve_value,5,train,webarena_verified.4 
-webarena_verified.6,False,shopping_admin,retrieve_value,6,test,webarena_verified.5 -webarena_verified.7,False,map,retrieve_value,7,train, -webarena_verified.8,False,map,string_match,8,test,webarena_verified.7 -webarena_verified.9,False,map,retrieve_value,9,test,webarena_verified.8 -webarena_verified.10,False,map,retrieve_value,10,test,webarena_verified.9 -webarena_verified.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.6 -webarena_verified.12,False,shopping_admin,retrieve_value,12,train,webarena_verified.11 -webarena_verified.13,False,shopping_admin,retrieve_value,13,train,webarena_verified.12 -webarena_verified.14,False,shopping_admin,retrieve_value,14,train,webarena_verified.13 -webarena_verified.15,False,shopping_admin,retrieve_value,15,test,webarena_verified.14 -webarena_verified.16,False,map,string_match,16,test,webarena_verified.10 -webarena_verified.17,False,map,string_match,17,train,webarena_verified.16 -webarena_verified.18,False,map,string_match,18,test,webarena_verified.17 -webarena_verified.19,False,map,string_match,19,train,webarena_verified.18 -webarena_verified.20,False,map,string_match,20,test,webarena_verified.19 -webarena_verified.21,False,shopping,retrieve_value,21,test, -webarena_verified.22,False,shopping,retrieve_value,22,test,webarena_verified.21 -webarena_verified.23,False,shopping,retrieve_value,23,test,webarena_verified.22 -webarena_verified.24,False,shopping,retrieve_value,24,test,webarena_verified.23 -webarena_verified.25,False,shopping,retrieve_value,25,test,webarena_verified.24 -webarena_verified.26,False,shopping,retrieve_value,26,test,webarena_verified.25 -webarena_verified.27,False,reddit,retrieve_value,27,test, -webarena_verified.28,False,reddit,retrieve_value,28,train,webarena_verified.27 -webarena_verified.29,False,reddit,retrieve_value,29,train,webarena_verified.28 -webarena_verified.30,False,reddit,retrieve_value,30,test,webarena_verified.29 
-webarena_verified.31,False,reddit,retrieve_value,31,train,webarena_verified.30 -webarena_verified.32,False,map,retrieve_value,32,test,webarena_verified.20 -webarena_verified.33,False,map,retrieve_value,33,test,webarena_verified.32 -webarena_verified.34,False,map,retrieve_value,34,train,webarena_verified.33 -webarena_verified.35,False,map,retrieve_value,35,test,webarena_verified.34 -webarena_verified.36,False,map,retrieve_value,36,test,webarena_verified.35 -webarena_verified.37,False,map,retrieve_value,37,train,webarena_verified.36 -webarena_verified.38,False,map,retrieve_value,38,train,webarena_verified.37 -webarena_verified.39,False,map,retrieve_value,39,train,webarena_verified.38 -webarena_verified.40,False,map,retrieve_value,40,test,webarena_verified.39 -webarena_verified.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.15 -webarena_verified.42,False,shopping_admin,retrieve_value,42,train,webarena_verified.41 -webarena_verified.43,False,shopping_admin,retrieve_value,43,test,webarena_verified.42 -webarena_verified.44,False,gitlab,ui_state,44,train, -webarena_verified.45,False,gitlab,ui_state,45,test,webarena_verified.44 -webarena_verified.46,False,gitlab,ui_state,46,test,webarena_verified.45 -webarena_verified.47,False,shopping,retrieve_value,47,train,webarena_verified.26 -webarena_verified.48,False,shopping,retrieve_value,48,test,webarena_verified.47 -webarena_verified.49,False,shopping,retrieve_value,49,train,webarena_verified.48 -webarena_verified.50,False,shopping,retrieve_value,50,train,webarena_verified.49 -webarena_verified.51,False,shopping,retrieve_value,51,test,webarena_verified.50 -webarena_verified.52,False,map,string_match,52,test,webarena_verified.40 -webarena_verified.53,False,map,string_match,53,train,webarena_verified.52 -webarena_verified.54,False,map,string_match,54,test,webarena_verified.53 -webarena_verified.55,False,map,string_match,55,train,webarena_verified.54 
-webarena_verified.56,False,map,string_match,56,train,webarena_verified.55 -webarena_verified.57,False,map,retrieve_value,57,train,webarena_verified.56 -webarena_verified.58,False,map,retrieve_value,58,train,webarena_verified.57 -webarena_verified.59,False,map,retrieve_value,59,test,webarena_verified.58 -webarena_verified.60,False,map,retrieve_value,60,test,webarena_verified.59 -webarena_verified.61,False,map,retrieve_value,61,train,webarena_verified.60 -webarena_verified.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.43 -webarena_verified.63,False,shopping_admin,retrieve_value,63,test,webarena_verified.62 -webarena_verified.64,False,shopping_admin,retrieve_value,64,test,webarena_verified.63 -webarena_verified.65,False,shopping_admin,retrieve_value,65,train,webarena_verified.64 -webarena_verified.66,False,reddit,retrieve_value,66,test,webarena_verified.31 -webarena_verified.67,False,reddit,retrieve_value,67,test,webarena_verified.66 -webarena_verified.68,False,reddit,retrieve_value,68,train,webarena_verified.67 -webarena_verified.69,False,reddit,retrieve_value,69,test,webarena_verified.68 -webarena_verified.70,False,map,retrieve_value,70,train,webarena_verified.61 -webarena_verified.71,False,map,retrieve_value,71,test,webarena_verified.70 -webarena_verified.72,False,map,retrieve_value,72,train,webarena_verified.71 -webarena_verified.73,False,map,retrieve_value,73,test,webarena_verified.72 -webarena_verified.74,False,map,string_match,74,train,webarena_verified.73 -webarena_verified.75,False,map,string_match,75,train,webarena_verified.74 -webarena_verified.76,False,map,retrieve_value,76,train,webarena_verified.75 -webarena_verified.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.65 -webarena_verified.78,False,shopping_admin,retrieve_value,78,train,webarena_verified.77 -webarena_verified.79,False,shopping_admin,retrieve_value,79,test,webarena_verified.78 -webarena_verified.80,False,map,string_match,80,test,webarena_verified.76 
-webarena_verified.81,False,map,string_match,81,test,webarena_verified.80 -webarena_verified.82,False,map,string_match,82,train,webarena_verified.81 -webarena_verified.83,False,map,string_match,83,train,webarena_verified.82 -webarena_verified.84,False,map,string_match,84,train,webarena_verified.83 -webarena_verified.85,False,map,string_match,85,test,webarena_verified.84 -webarena_verified.86,False,map,string_match,86,test,webarena_verified.85 -webarena_verified.87,False,map,string_match,87,train,webarena_verified.86 -webarena_verified.88,False,map,string_match,88,train,webarena_verified.87 -webarena_verified.89,False,map,retrieve_value,89,test,webarena_verified.88 -webarena_verified.90,False,map,retrieve_value,90,test,webarena_verified.89 -webarena_verified.91,False,map,retrieve_value,91,train,webarena_verified.90 -webarena_verified.92,False,map,retrieve_value,92,train,webarena_verified.91 -webarena_verified.93,False,map,retrieve_value,93,train,webarena_verified.92 -webarena_verified.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.79 -webarena_verified.95,False,shopping_admin,retrieve_value,95,train,webarena_verified.94 -webarena_verified.96,False,shopping,retrieve_value,96,test,webarena_verified.51 -webarena_verified.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.93 -webarena_verified.98,False,map,retrieve_value,98,test,webarena_verified.97 -webarena_verified.99,False,map,retrieve_value,99,train,webarena_verified.98 -webarena_verified.100,False,map,retrieve_value,100,test,webarena_verified.99 -webarena_verified.101,False,map,string_match,101,train,webarena_verified.100 -webarena_verified.102,False,gitlab,ui_state,102,train,webarena_verified.46 -webarena_verified.103,False,gitlab,ui_state,103,train,webarena_verified.102 -webarena_verified.104,False,gitlab,ui_state,104,test,webarena_verified.103 -webarena_verified.105,False,gitlab,ui_state,105,train,webarena_verified.104 
-webarena_verified.106,False,gitlab,ui_state,106,test,webarena_verified.105 -webarena_verified.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.95 -webarena_verified.108,False,shopping_admin,retrieve_value,108,train,webarena_verified.107 -webarena_verified.109,False,shopping_admin,retrieve_value,109,test,webarena_verified.108 -webarena_verified.110,False,shopping_admin,retrieve_value,110,train,webarena_verified.109 -webarena_verified.111,False,shopping_admin,retrieve_value,111,train,webarena_verified.110 -webarena_verified.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.111 -webarena_verified.113,False,shopping_admin,retrieve_value,113,test,webarena_verified.112 -webarena_verified.114,False,shopping_admin,retrieve_value,114,train,webarena_verified.113 -webarena_verified.115,False,shopping_admin,retrieve_value,115,test,webarena_verified.114 -webarena_verified.116,False,shopping_admin,retrieve_value,116,test,webarena_verified.115 -webarena_verified.117,False,shopping,retrieve_value,117,test,webarena_verified.96 -webarena_verified.118,False,shopping,program_html,118,train,webarena_verified.117 -webarena_verified.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.116 -webarena_verified.120,False,shopping_admin,retrieve_value,120,train,webarena_verified.119 -webarena_verified.121,False,shopping_admin,retrieve_value,121,train,webarena_verified.120 -webarena_verified.122,False,shopping_admin,retrieve_value,122,test,webarena_verified.121 -webarena_verified.123,False,shopping_admin,retrieve_value,123,train,webarena_verified.122 -webarena_verified.124,False,shopping,retrieve_value,124,train,webarena_verified.118 -webarena_verified.125,False,shopping,retrieve_value,125,train,webarena_verified.124 -webarena_verified.126,False,shopping,retrieve_value,126,test,webarena_verified.125 -webarena_verified.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.123 
-webarena_verified.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.127 -webarena_verified.129,False,shopping_admin,retrieve_value,129,train,webarena_verified.128 -webarena_verified.130,False,shopping_admin,retrieve_value,130,train,webarena_verified.129 -webarena_verified.131,False,shopping_admin,retrieve_value,131,test,webarena_verified.130 -webarena_verified.132,False,gitlab,retrieve_value,132,train,webarena_verified.106 -webarena_verified.133,False,gitlab,retrieve_value,133,test,webarena_verified.132 -webarena_verified.134,False,gitlab,retrieve_value,134,test,webarena_verified.133 -webarena_verified.135,False,gitlab,retrieve_value,135,train,webarena_verified.134 -webarena_verified.136,False,gitlab,retrieve_value,136,train,webarena_verified.135 -webarena_verified.137,False,map,string_match,137,test,webarena_verified.101 -webarena_verified.138,False,map,string_match,138,test,webarena_verified.137 -webarena_verified.139,False,map,string_match,139,test,webarena_verified.138 -webarena_verified.140,False,map,string_match,140,train,webarena_verified.139 -webarena_verified.141,False,shopping,retrieve_value,141,train,webarena_verified.126 -webarena_verified.142,False,shopping,retrieve_value,142,train,webarena_verified.141 -webarena_verified.143,False,shopping,retrieve_value,143,test,webarena_verified.142 -webarena_verified.144,False,shopping,retrieve_value,144,test,webarena_verified.143 -webarena_verified.145,False,shopping,retrieve_value,145,train,webarena_verified.144 -webarena_verified.146,False,shopping,retrieve_value,146,test,webarena_verified.145 -webarena_verified.147,False,shopping,retrieve_value,147,train,webarena_verified.146 -webarena_verified.148,False,shopping,retrieve_value,148,train,webarena_verified.147 -webarena_verified.149,False,shopping,retrieve_value,149,test,webarena_verified.148 -webarena_verified.150,False,shopping,retrieve_value,150,train,webarena_verified.149 
-webarena_verified.151,False,map,string_match,151,train,webarena_verified.140 -webarena_verified.152,False,map,string_match,152,train,webarena_verified.151 -webarena_verified.153,False,map,string_match,153,test,webarena_verified.152 -webarena_verified.154,False,map,string_match,154,train,webarena_verified.153 -webarena_verified.155,False,map,string_match,155,test,webarena_verified.154 -webarena_verified.156,False,gitlab,ui_state,156,test,webarena_verified.136 -webarena_verified.157,False,shopping_admin,ui_state,157,train,webarena_verified.131 -webarena_verified.158,False,shopping,ui_state,158,test,webarena_verified.150 -webarena_verified.159,False,shopping,ui_state,159,train,webarena_verified.158 -webarena_verified.160,False,shopping,ui_state,160,train,webarena_verified.159 -webarena_verified.161,False,shopping,ui_state,161,train,webarena_verified.160 -webarena_verified.162,False,shopping,ui_state,162,test,webarena_verified.161 -webarena_verified.163,False,shopping,retrieve_value,163,test,webarena_verified.162 -webarena_verified.164,False,shopping,retrieve_value,164,test,webarena_verified.163 -webarena_verified.165,False,shopping,retrieve_value,165,test,webarena_verified.164 -webarena_verified.166,False,shopping,retrieve_value,166,test,webarena_verified.165 -webarena_verified.167,False,shopping,retrieve_value,167,test,webarena_verified.166 -webarena_verified.168,False,gitlab,retrieve_value,168,test,webarena_verified.156 -webarena_verified.169,False,gitlab,retrieve_value,169,train,webarena_verified.168 -webarena_verified.170,False,gitlab,retrieve_value,170,train,webarena_verified.169 -webarena_verified.171,False,gitlab,retrieve_value,171,test,webarena_verified.170 -webarena_verified.172,False,gitlab,retrieve_value,172,train,webarena_verified.171 -webarena_verified.173,False,gitlab,retrieve_value,173,train,webarena_verified.172 -webarena_verified.174,False,gitlab,retrieve_value,174,test,webarena_verified.173 
-webarena_verified.175,False,gitlab,retrieve_value,175,train,webarena_verified.174 -webarena_verified.176,False,gitlab,retrieve_value,176,train,webarena_verified.175 -webarena_verified.177,False,gitlab,retrieve_value,177,test,webarena_verified.176 -webarena_verified.178,False,gitlab,retrieve_value,178,test,webarena_verified.177 -webarena_verified.179,False,gitlab,retrieve_value,179,train,webarena_verified.178 -webarena_verified.180,False,gitlab,retrieve_value,180,train,webarena_verified.179 -webarena_verified.181,False,gitlab,retrieve_value,181,test,webarena_verified.180 -webarena_verified.182,False,gitlab,retrieve_value,182,train,webarena_verified.181 -webarena_verified.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.157 -webarena_verified.184,False,shopping_admin,retrieve_value,184,train,webarena_verified.183 -webarena_verified.185,False,shopping_admin,retrieve_value,185,test,webarena_verified.184 -webarena_verified.186,False,shopping_admin,retrieve_value,186,train,webarena_verified.185 -webarena_verified.187,False,shopping_admin,retrieve_value,187,test,webarena_verified.186 -webarena_verified.188,False,shopping,retrieve_value,188,test,webarena_verified.167 -webarena_verified.189,False,shopping,retrieve_value,189,train,webarena_verified.188 -webarena_verified.190,False,shopping,retrieve_value,190,train,webarena_verified.189 -webarena_verified.191,False,shopping,retrieve_value,191,train,webarena_verified.190 -webarena_verified.192,False,shopping,retrieve_value,192,test,webarena_verified.191 -webarena_verified.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.187 -webarena_verified.194,False,shopping_admin,retrieve_value,194,train,webarena_verified.193 -webarena_verified.195,False,shopping_admin,retrieve_value,195,test,webarena_verified.194 -webarena_verified.196,False,shopping_admin,retrieve_value,196,train,webarena_verified.195 -webarena_verified.197,False,shopping_admin,retrieve_value,197,train,webarena_verified.196 
-webarena_verified.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.197 -webarena_verified.199,False,shopping_admin,retrieve_value,199,train,webarena_verified.198 -webarena_verified.200,False,shopping_admin,retrieve_value,200,train,webarena_verified.199 -webarena_verified.201,False,shopping_admin,retrieve_value,201,test,webarena_verified.200 -webarena_verified.202,False,shopping_admin,retrieve_value,202,train,webarena_verified.201 -webarena_verified.203,False,shopping_admin,retrieve_value,203,test,webarena_verified.202 -webarena_verified.204,False,shopping_admin,retrieve_value,204,test,webarena_verified.203 -webarena_verified.205,False,gitlab,retrieve_value,205,train,webarena_verified.182 -webarena_verified.206,False,gitlab,retrieve_value,206,test,webarena_verified.205 -webarena_verified.207,False,gitlab,retrieve_value,207,test,webarena_verified.206 -webarena_verified.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.204 -webarena_verified.209,False,shopping_admin,retrieve_value,209,test,webarena_verified.208 -webarena_verified.210,False,shopping_admin,retrieve_value,210,train,webarena_verified.209 -webarena_verified.211,False,shopping_admin,retrieve_value,211,train,webarena_verified.210 -webarena_verified.212,False,shopping_admin,retrieve_value,212,train,webarena_verified.211 -webarena_verified.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.212 -webarena_verified.214,False,shopping_admin,retrieve_value,214,train,webarena_verified.213 -webarena_verified.215,False,shopping_admin,retrieve_value,215,test,webarena_verified.214 -webarena_verified.216,False,shopping_admin,retrieve_value,216,train,webarena_verified.215 -webarena_verified.217,False,shopping_admin,retrieve_value,217,train,webarena_verified.216 -webarena_verified.218,False,map,string_match,218,train,webarena_verified.155 -webarena_verified.219,False,map,string_match,219,test,webarena_verified.218 
-webarena_verified.220,False,map,string_match,220,train,webarena_verified.219 -webarena_verified.221,False,map,string_match,221,test,webarena_verified.220 -webarena_verified.222,False,map,string_match,222,train,webarena_verified.221 -webarena_verified.223,False,map,string_match,223,test,webarena_verified.222 -webarena_verified.224,False,map,string_match,224,test,webarena_verified.223 -webarena_verified.225,False,shopping,retrieve_value,225,test,webarena_verified.192 -webarena_verified.226,False,shopping,retrieve_value,226,train,webarena_verified.225 -webarena_verified.227,False,shopping,retrieve_value,227,train,webarena_verified.226 -webarena_verified.228,False,shopping,retrieve_value,228,test,webarena_verified.227 -webarena_verified.229,False,shopping,retrieve_value,229,test,webarena_verified.228 -webarena_verified.230,False,shopping,retrieve_value,230,train,webarena_verified.229 -webarena_verified.231,False,shopping,retrieve_value,231,test,webarena_verified.230 -webarena_verified.232,False,shopping,retrieve_value,232,train,webarena_verified.231 -webarena_verified.233,False,shopping,retrieve_value,233,test,webarena_verified.232 -webarena_verified.234,False,shopping,retrieve_value,234,train,webarena_verified.233 -webarena_verified.235,False,shopping,retrieve_value,235,train,webarena_verified.234 -webarena_verified.236,False,map,retrieve_value,236,train,webarena_verified.224 -webarena_verified.237,False,map,retrieve_value,237,train,webarena_verified.236 -webarena_verified.238,False,shopping,ui_state,238,train,webarena_verified.235 -webarena_verified.239,False,shopping,ui_state,239,train,webarena_verified.238 -webarena_verified.240,False,shopping,ui_state,240,test,webarena_verified.239 -webarena_verified.241,False,shopping,ui_state,241,train,webarena_verified.240 -webarena_verified.242,False,shopping,ui_state,242,test,webarena_verified.241 -webarena_verified.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.217 
-webarena_verified.244,False,shopping_admin,retrieve_value,244,test,webarena_verified.243 -webarena_verified.245,False,shopping_admin,retrieve_value,245,train,webarena_verified.244 -webarena_verified.246,False,shopping_admin,retrieve_value,246,test,webarena_verified.245 -webarena_verified.247,False,shopping_admin,retrieve_value,247,train,webarena_verified.246 -webarena_verified.248,False,map,retrieve_value,248,test,webarena_verified.237 -webarena_verified.249,False,map,retrieve_value,249,train,webarena_verified.248 -webarena_verified.250,False,map,retrieve_value,250,test,webarena_verified.249 -webarena_verified.251,False,map,retrieve_value,251,train,webarena_verified.250 -webarena_verified.252,False,map,retrieve_value,252,train,webarena_verified.251 -webarena_verified.253,False,map,string_match,253,test,webarena_verified.252 -webarena_verified.254,False,map,retrieve_value,254,train,webarena_verified.253 -webarena_verified.255,False,map,retrieve_value,255,test,webarena_verified.254 -webarena_verified.256,False,map,retrieve_value,256,train,webarena_verified.255 -webarena_verified.257,False,map,string_match,257,test,webarena_verified.256 -webarena_verified.258,False,gitlab,ui_state,258,train,webarena_verified.207 -webarena_verified.259,False,gitlab,retrieve_value,259,train,webarena_verified.258 -webarena_verified.260,False,shopping,ui_state,260,test,webarena_verified.242 -webarena_verified.261,False,shopping,ui_state,261,train,webarena_verified.260 -webarena_verified.262,False,shopping,ui_state,262,train,webarena_verified.261 -webarena_verified.263,False,shopping,ui_state,263,test,webarena_verified.262 -webarena_verified.264,False,shopping,ui_state,264,train,webarena_verified.263 -webarena_verified.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.257 -webarena_verified.266,False,wikipedia map,retrieve_value,266,test,webarena_verified.265 -webarena_verified.267,False,wikipedia map,retrieve_value,267,train,webarena_verified.266 
-webarena_verified.268,False,wikipedia map,retrieve_value,268,test,webarena_verified.267 -webarena_verified.269,False,shopping,ui_state,269,train,webarena_verified.264 -webarena_verified.270,False,shopping,ui_state,270,train,webarena_verified.269 -webarena_verified.271,False,shopping,ui_state,271,test,webarena_verified.270 -webarena_verified.272,False,shopping,ui_state,272,test,webarena_verified.271 -webarena_verified.273,False,shopping,ui_state,273,train,webarena_verified.272 -webarena_verified.274,False,shopping,ui_state,274,test,webarena_verified.273 -webarena_verified.275,False,shopping,ui_state,275,test,webarena_verified.274 -webarena_verified.276,False,shopping,ui_state,276,train,webarena_verified.275 -webarena_verified.277,False,shopping,ui_state,277,train,webarena_verified.276 -webarena_verified.278,False,shopping,ui_state,278,train,webarena_verified.277 -webarena_verified.279,False,shopping,retrieve_value,279,train,webarena_verified.278 -webarena_verified.280,False,shopping,retrieve_value,280,test,webarena_verified.279 -webarena_verified.281,False,shopping,retrieve_value,281,train,webarena_verified.280 -webarena_verified.282,False,shopping,retrieve_value,282,train,webarena_verified.281 -webarena_verified.283,False,shopping,ui_state,283,test,webarena_verified.282 -webarena_verified.284,False,shopping,ui_state,284,test,webarena_verified.283 -webarena_verified.285,False,shopping,ui_state,285,train,webarena_verified.284 -webarena_verified.286,False,shopping,ui_state,286,test,webarena_verified.285 -webarena_verified.287,False,map,string_match,287,test,webarena_verified.268 -webarena_verified.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.247 -webarena_verified.289,False,shopping_admin,retrieve_value,289,test,webarena_verified.288 -webarena_verified.290,False,shopping_admin,retrieve_value,290,train,webarena_verified.289 -webarena_verified.291,False,shopping_admin,retrieve_value,291,train,webarena_verified.290 
-webarena_verified.292,False,shopping_admin,retrieve_value,292,test,webarena_verified.291 -webarena_verified.293,False,gitlab,retrieve_value,293,train,webarena_verified.259 -webarena_verified.294,False,gitlab,retrieve_value,294,train,webarena_verified.293 -webarena_verified.295,False,gitlab,retrieve_value,295,test,webarena_verified.294 -webarena_verified.296,False,gitlab,retrieve_value,296,train,webarena_verified.295 -webarena_verified.297,False,gitlab,retrieve_value,297,test,webarena_verified.296 -webarena_verified.298,False,shopping,ui_state,298,train,webarena_verified.286 -webarena_verified.299,False,shopping,ui_state,299,train,webarena_verified.298 -webarena_verified.300,False,shopping,ui_state,300,test,webarena_verified.299 -webarena_verified.301,False,shopping,retrieve_value,301,test,webarena_verified.300 -webarena_verified.302,False,shopping,retrieve_value,302,train,webarena_verified.301 -webarena_verified.303,False,gitlab,retrieve_value,303,test,webarena_verified.297 -webarena_verified.304,False,gitlab,retrieve_value,304,train,webarena_verified.303 -webarena_verified.305,False,gitlab,retrieve_value,305,train,webarena_verified.304 -webarena_verified.306,False,gitlab,retrieve_value,306,test,webarena_verified.305 -webarena_verified.307,False,gitlab,retrieve_value,307,train,webarena_verified.306 -webarena_verified.308,False,gitlab,retrieve_value,308,train,webarena_verified.307 -webarena_verified.309,False,gitlab,retrieve_value,309,train,webarena_verified.308 -webarena_verified.310,False,gitlab,retrieve_value,310,train,webarena_verified.309 -webarena_verified.311,False,gitlab,retrieve_value,311,test,webarena_verified.310 -webarena_verified.312,False,gitlab,retrieve_value,312,test,webarena_verified.311 -webarena_verified.313,False,shopping,retrieve_value,313,train,webarena_verified.302 -webarena_verified.314,False,gitlab,retrieve_value,314,train,webarena_verified.312 -webarena_verified.315,False,gitlab,retrieve_value,315,train,webarena_verified.314 
-webarena_verified.316,False,gitlab,retrieve_value,316,test,webarena_verified.315 -webarena_verified.317,False,gitlab,retrieve_value,317,test,webarena_verified.316 -webarena_verified.318,False,gitlab,retrieve_value,318,train,webarena_verified.317 -webarena_verified.319,False,shopping,retrieve_value,319,train,webarena_verified.313 -webarena_verified.320,False,shopping,retrieve_value,320,test,webarena_verified.319 -webarena_verified.321,False,shopping,retrieve_value,321,train,webarena_verified.320 -webarena_verified.322,False,shopping,retrieve_value,322,test,webarena_verified.321 -webarena_verified.323,False,shopping,retrieve_value,323,train,webarena_verified.322 -webarena_verified.324,False,shopping,ui_state,324,train,webarena_verified.323 -webarena_verified.325,False,shopping,ui_state,325,test,webarena_verified.324 -webarena_verified.326,False,shopping,ui_state,326,train,webarena_verified.325 -webarena_verified.327,False,shopping,ui_state,327,test,webarena_verified.326 -webarena_verified.328,False,shopping,ui_state,328,train,webarena_verified.327 -webarena_verified.329,False,shopping,retrieve_value,329,test,webarena_verified.328 -webarena_verified.330,False,shopping,retrieve_value,330,test,webarena_verified.329 -webarena_verified.331,False,shopping,retrieve_value,331,test,webarena_verified.330 -webarena_verified.332,False,shopping,retrieve_value,332,train,webarena_verified.331 -webarena_verified.333,False,shopping,retrieve_value,333,train,webarena_verified.332 -webarena_verified.334,False,shopping,retrieve_value,334,train,webarena_verified.333 -webarena_verified.335,False,shopping,retrieve_value,335,train,webarena_verified.334 -webarena_verified.336,False,shopping,retrieve_value,336,test,webarena_verified.335 -webarena_verified.337,False,shopping,retrieve_value,337,test,webarena_verified.336 -webarena_verified.338,False,shopping,retrieve_value,338,train,webarena_verified.337 -webarena_verified.339,False,gitlab,ui_state,339,test,webarena_verified.318 
-webarena_verified.340,False,gitlab,ui_state,340,train,webarena_verified.339 -webarena_verified.341,False,gitlab,ui_state,341,test,webarena_verified.340 -webarena_verified.342,False,gitlab,ui_state,342,test,webarena_verified.341 -webarena_verified.343,False,gitlab,ui_state,343,test,webarena_verified.342 -webarena_verified.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.292 -webarena_verified.345,False,shopping_admin,retrieve_value,345,train,webarena_verified.344 -webarena_verified.346,False,shopping_admin,retrieve_value,346,train,webarena_verified.345 -webarena_verified.347,False,shopping_admin,retrieve_value,347,train,webarena_verified.346 -webarena_verified.348,False,shopping_admin,retrieve_value,348,test,webarena_verified.347 -webarena_verified.349,False,gitlab,retrieve_value,349,test,webarena_verified.343 -webarena_verified.350,False,gitlab,retrieve_value,350,test,webarena_verified.349 -webarena_verified.351,False,shopping,ui_state,351,train,webarena_verified.338 -webarena_verified.352,False,shopping,ui_state,352,test,webarena_verified.351 -webarena_verified.353,False,shopping,ui_state,353,test,webarena_verified.352 -webarena_verified.354,False,shopping,ui_state,354,train,webarena_verified.353 -webarena_verified.355,False,shopping,ui_state,355,train,webarena_verified.354 -webarena_verified.356,False,map,program_html,356,test,webarena_verified.287 -webarena_verified.357,False,gitlab,ui_state,357,test,webarena_verified.350 -webarena_verified.358,False,shopping,retrieve_value,358,train,webarena_verified.355 -webarena_verified.359,False,shopping,retrieve_value,359,test,webarena_verified.358 -webarena_verified.360,False,shopping,retrieve_value,360,train,webarena_verified.359 -webarena_verified.361,False,shopping,retrieve_value,361,train,webarena_verified.360 -webarena_verified.362,False,shopping,retrieve_value,362,test,webarena_verified.361 -webarena_verified.363,False,map,retrieve_value,363,train,webarena_verified.356 
-webarena_verified.364,False,map,retrieve_value,364,test,webarena_verified.363 -webarena_verified.365,False,map,retrieve_value,365,test,webarena_verified.364 -webarena_verified.366,False,map,retrieve_value,366,train,webarena_verified.365 -webarena_verified.367,False,map,retrieve_value,367,train,webarena_verified.366 -webarena_verified.368,False,shopping,retrieve_value,368,test,webarena_verified.362 -webarena_verified.369,False,map,program_html,369,train,webarena_verified.367 -webarena_verified.370,False,map,program_html,370,test,webarena_verified.369 -webarena_verified.371,False,map,program_html,371,test,webarena_verified.370 -webarena_verified.372,False,map,program_html,372,train,webarena_verified.371 -webarena_verified.373,False,map,program_html,373,train,webarena_verified.372 -webarena_verified.374,False,shopping_admin,ui_state,374,train,webarena_verified.348 -webarena_verified.375,False,shopping_admin,ui_state,375,train,webarena_verified.374 -webarena_verified.376,False,shopping,retrieve_value,376,test,webarena_verified.368 -webarena_verified.377,False,map,ui_state,377,test,webarena_verified.373 -webarena_verified.378,False,map,ui_state,378,train,webarena_verified.377 -webarena_verified.379,False,map,ui_state,379,train,webarena_verified.378 -webarena_verified.380,False,map,ui_state,380,test,webarena_verified.379 -webarena_verified.381,False,map,ui_state,381,train,webarena_verified.380 -webarena_verified.382,False,map,string_match,382,test,webarena_verified.381 -webarena_verified.383,False,map,retrieve_value,383,test,webarena_verified.382 -webarena_verified.384,False,shopping,retrieve_value,384,test,webarena_verified.376 -webarena_verified.385,False,shopping,retrieve_value,385,train,webarena_verified.384 -webarena_verified.386,False,shopping,retrieve_value,386,test,webarena_verified.385 -webarena_verified.387,False,shopping,retrieve_value,387,train,webarena_verified.386 -webarena_verified.388,False,shopping,retrieve_value,388,test,webarena_verified.387 
-webarena_verified.389,False,gitlab,backend_state,389,test,webarena_verified.357 -webarena_verified.390,False,gitlab,backend_state,390,train,webarena_verified.389 -webarena_verified.391,False,gitlab,backend_state,391,train,webarena_verified.390 -webarena_verified.392,False,gitlab,backend_state,392,test,webarena_verified.391 -webarena_verified.393,False,gitlab,backend_state,393,train,webarena_verified.392 -webarena_verified.394,False,gitlab,backend_state,394,test,webarena_verified.393 -webarena_verified.395,False,gitlab,backend_state,395,train,webarena_verified.394 -webarena_verified.396,False,gitlab,backend_state,396,train,webarena_verified.395 -webarena_verified.397,False,gitlab,backend_state,397,train,webarena_verified.396 -webarena_verified.398,False,gitlab,backend_state,398,test,webarena_verified.397 -webarena_verified.399,False,reddit,backend_state,399,train,webarena_verified.69 -webarena_verified.400,False,reddit,backend_state,400,test,webarena_verified.399 -webarena_verified.401,False,reddit,backend_state,401,train,webarena_verified.400 -webarena_verified.402,False,reddit,backend_state,402,train,webarena_verified.401 -webarena_verified.403,False,reddit,backend_state,403,test,webarena_verified.402 -webarena_verified.404,False,reddit,backend_state,404,train,webarena_verified.403 -webarena_verified.405,False,reddit,backend_state,405,test,webarena_verified.404 -webarena_verified.406,False,reddit,backend_state,406,train,webarena_verified.405 -webarena_verified.407,False,reddit,backend_state,407,test,webarena_verified.406 -webarena_verified.408,False,reddit,backend_state,408,train,webarena_verified.407 -webarena_verified.409,False,reddit,backend_state,409,test,webarena_verified.408 -webarena_verified.410,False,reddit,backend_state,410,test,webarena_verified.409 -webarena_verified.411,False,gitlab,backend_state,411,test,webarena_verified.398 -webarena_verified.412,False,gitlab,backend_state,412,test,webarena_verified.411 
-webarena_verified.413,False,gitlab,backend_state,413,test,webarena_verified.412 -webarena_verified.414,False,gitlab,backend_state,414,test,webarena_verified.413 -webarena_verified.415,False,gitlab,backend_state,415,test,webarena_verified.414 -webarena_verified.416,False,gitlab,backend_state,416,test,webarena_verified.415 -webarena_verified.417,False,gitlab,backend_state,417,test,webarena_verified.416 -webarena_verified.418,False,gitlab,backend_state,418,train,webarena_verified.417 -webarena_verified.419,False,gitlab,backend_state,419,test,webarena_verified.418 -webarena_verified.420,False,gitlab,backend_state,420,test,webarena_verified.419 -webarena_verified.421,False,gitlab,backend_state,421,train,webarena_verified.420 -webarena_verified.422,False,gitlab,backend_state,422,train,webarena_verified.421 -webarena_verified.423,False,shopping_admin,backend_state,423,train,webarena_verified.375 -webarena_verified.424,False,wikipedia map,program_html,424,train,webarena_verified.383 -webarena_verified.425,False,wikipedia map,program_html,425,train,webarena_verified.424 -webarena_verified.426,False,wikipedia map,program_html,426,test,webarena_verified.425 -webarena_verified.427,False,wikipedia map,program_html,427,test,webarena_verified.426 -webarena_verified.428,False,wikipedia map,program_html,428,train,webarena_verified.427 -webarena_verified.429,False,wikipedia map,program_html,429,train,webarena_verified.428 -webarena_verified.430,False,wikipedia map,program_html,430,test,webarena_verified.429 -webarena_verified.431,False,shopping,program_html,431,train,webarena_verified.388 -webarena_verified.432,False,shopping,backend_state,432,test,webarena_verified.431 -webarena_verified.433,False,shopping,backend_state,433,train,webarena_verified.432 -webarena_verified.434,False,shopping,backend_state,434,train,webarena_verified.433 -webarena_verified.435,False,shopping,backend_state,435,train,webarena_verified.434 
-webarena_verified.436,False,shopping,backend_state,436,test,webarena_verified.435 -webarena_verified.437,False,shopping,backend_state,437,train,webarena_verified.436 -webarena_verified.438,False,shopping,backend_state,438,train,webarena_verified.437 -webarena_verified.439,False,shopping,backend_state,439,train,webarena_verified.438 -webarena_verified.440,False,shopping,backend_state,440,test,webarena_verified.439 -webarena_verified.441,False,gitlab,backend_state,441,train,webarena_verified.422 -webarena_verified.442,False,gitlab,backend_state,442,train,webarena_verified.441 -webarena_verified.443,False,gitlab,backend_state,443,test,webarena_verified.442 -webarena_verified.444,False,gitlab,backend_state,444,train,webarena_verified.443 -webarena_verified.445,False,gitlab,backend_state,445,test,webarena_verified.444 -webarena_verified.446,False,gitlab,backend_state,446,test,webarena_verified.445 -webarena_verified.447,False,gitlab,backend_state,447,train,webarena_verified.446 -webarena_verified.448,False,gitlab,backend_state,448,test,webarena_verified.447 -webarena_verified.449,False,gitlab,backend_state,449,test,webarena_verified.448 -webarena_verified.450,False,gitlab,retrieve_value,450,train,webarena_verified.449 -webarena_verified.451,False,gitlab,retrieve_value,451,train,webarena_verified.450 -webarena_verified.452,False,gitlab,retrieve_value,452,train,webarena_verified.451 -webarena_verified.453,False,shopping_admin,backend_state,453,train,webarena_verified.423 -webarena_verified.454,False,shopping_admin,backend_state,454,test,webarena_verified.453 -webarena_verified.455,False,shopping_admin,backend_state,455,train,webarena_verified.454 -webarena_verified.456,False,shopping_admin,backend_state,456,test,webarena_verified.455 -webarena_verified.457,False,shopping_admin,backend_state,457,train,webarena_verified.456 -webarena_verified.458,False,shopping_admin,backend_state,458,test,webarena_verified.457 
-webarena_verified.459,False,shopping_admin,backend_state,459,test,webarena_verified.458 -webarena_verified.460,False,shopping_admin,backend_state,460,train,webarena_verified.459 -webarena_verified.461,False,shopping_admin,backend_state,461,train,webarena_verified.460 -webarena_verified.462,False,shopping_admin,backend_state,462,test,webarena_verified.461 -webarena_verified.463,False,shopping_admin,backend_state,463,test,webarena_verified.462 -webarena_verified.464,False,shopping_admin,backend_state,464,train,webarena_verified.463 -webarena_verified.465,False,shopping,backend_state,465,train,webarena_verified.440 -webarena_verified.466,False,shopping,backend_state,466,train,webarena_verified.465 -webarena_verified.467,False,shopping,backend_state,467,train,webarena_verified.466 -webarena_verified.468,False,shopping,backend_state,468,test,webarena_verified.467 -webarena_verified.469,False,shopping,backend_state,469,test,webarena_verified.468 -webarena_verified.470,False,shopping_admin,backend_state,470,test,webarena_verified.464 -webarena_verified.471,False,shopping_admin,backend_state,471,test,webarena_verified.470 -webarena_verified.472,False,shopping_admin,backend_state,472,train,webarena_verified.471 -webarena_verified.473,False,shopping_admin,backend_state,473,train,webarena_verified.472 -webarena_verified.474,False,shopping_admin,backend_state,474,train,webarena_verified.473 -webarena_verified.475,False,gitlab,backend_state,475,train,webarena_verified.452 -webarena_verified.476,False,gitlab,backend_state,476,train,webarena_verified.475 -webarena_verified.477,False,gitlab,backend_state,477,train,webarena_verified.476 -webarena_verified.478,False,gitlab,backend_state,478,test,webarena_verified.477 -webarena_verified.479,False,gitlab,backend_state,479,test,webarena_verified.478 -webarena_verified.480,False,gitlab,backend_state,480,train,webarena_verified.479 -webarena_verified.481,False,gitlab,backend_state,481,train,webarena_verified.480 
-webarena_verified.482,False,gitlab,backend_state,482,train,webarena_verified.481 -webarena_verified.483,False,gitlab,backend_state,483,test,webarena_verified.482 -webarena_verified.484,False,gitlab,backend_state,484,train,webarena_verified.483 -webarena_verified.485,False,gitlab,backend_state,485,test,webarena_verified.484 -webarena_verified.486,False,shopping_admin,backend_state,486,train,webarena_verified.474 -webarena_verified.487,False,shopping_admin,backend_state,487,test,webarena_verified.486 -webarena_verified.488,False,shopping_admin,backend_state,488,test,webarena_verified.487 -webarena_verified.489,False,shopping_admin,backend_state,489,train,webarena_verified.488 -webarena_verified.490,False,shopping_admin,backend_state,490,train,webarena_verified.489 -webarena_verified.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.490 -webarena_verified.492,False,shopping_admin,backend_state,492,train,webarena_verified.491 -webarena_verified.493,False,shopping_admin,backend_state,493,train,webarena_verified.492 -webarena_verified.494,False,shopping_admin,backend_state,494,train,webarena_verified.493 -webarena_verified.495,False,shopping_admin,backend_state,495,test,webarena_verified.494 -webarena_verified.496,False,shopping_admin,backend_state,496,train,webarena_verified.495 -webarena_verified.497,False,shopping_admin,backend_state,497,test,webarena_verified.496 -webarena_verified.498,False,shopping_admin,backend_state,498,test,webarena_verified.497 -webarena_verified.499,False,shopping_admin,backend_state,499,train,webarena_verified.498 -webarena_verified.500,False,shopping_admin,backend_state,500,train,webarena_verified.499 -webarena_verified.501,False,shopping_admin,backend_state,501,train,webarena_verified.500 -webarena_verified.502,False,shopping_admin,backend_state,502,test,webarena_verified.501 -webarena_verified.503,False,shopping_admin,backend_state,503,train,webarena_verified.502 
-webarena_verified.504,False,shopping_admin,backend_state,504,test,webarena_verified.503 -webarena_verified.505,False,shopping_admin,backend_state,505,train,webarena_verified.504 -webarena_verified.506,False,shopping,backend_state,506,train,webarena_verified.469 -webarena_verified.507,False,shopping,backend_state,507,train,webarena_verified.506 -webarena_verified.508,False,shopping,backend_state,508,test,webarena_verified.507 -webarena_verified.509,False,shopping,backend_state,509,test,webarena_verified.508 -webarena_verified.510,False,shopping,backend_state,510,test,webarena_verified.509 -webarena_verified.511,False,shopping,program_html,511,test,webarena_verified.510 -webarena_verified.512,False,shopping,program_html,512,train,webarena_verified.511 -webarena_verified.513,False,shopping,program_html,513,train,webarena_verified.512 -webarena_verified.514,False,shopping,program_html,514,test,webarena_verified.513 -webarena_verified.515,False,shopping,program_html,515,train,webarena_verified.514 -webarena_verified.516,False,shopping,backend_state,516,train,webarena_verified.515 -webarena_verified.517,False,shopping,backend_state,517,test,webarena_verified.516 -webarena_verified.518,False,shopping,backend_state,518,test,webarena_verified.517 -webarena_verified.519,False,shopping,backend_state,519,test,webarena_verified.518 -webarena_verified.520,False,shopping,backend_state,520,train,webarena_verified.519 -webarena_verified.521,False,shopping,backend_state,521,test,webarena_verified.520 -webarena_verified.522,False,gitlab,backend_state,522,test,webarena_verified.485 -webarena_verified.523,False,gitlab,backend_state,523,train,webarena_verified.522 -webarena_verified.524,False,gitlab,backend_state,524,test,webarena_verified.523 -webarena_verified.525,False,gitlab,backend_state,525,train,webarena_verified.524 -webarena_verified.526,False,gitlab,backend_state,526,train,webarena_verified.525 -webarena_verified.527,False,gitlab,backend_state,527,test,webarena_verified.526 
-webarena_verified.528,False,shopping,program_html,528,train,webarena_verified.521 -webarena_verified.529,False,shopping,program_html,529,test,webarena_verified.528 -webarena_verified.530,False,shopping,program_html,530,test,webarena_verified.529 -webarena_verified.531,False,shopping,program_html,531,train,webarena_verified.530 -webarena_verified.532,False,shopping,program_html,532,train,webarena_verified.531 -webarena_verified.533,False,gitlab,backend_state,533,test,webarena_verified.527 -webarena_verified.534,False,gitlab,backend_state,534,train,webarena_verified.533 -webarena_verified.535,False,gitlab,backend_state,535,test,webarena_verified.534 -webarena_verified.536,False,gitlab,backend_state,536,train,webarena_verified.535 -webarena_verified.537,False,gitlab,backend_state,537,train,webarena_verified.536 -webarena_verified.538,False,shopping_admin,backend_state,538,train,webarena_verified.505 -webarena_verified.539,False,shopping_admin,backend_state,539,train,webarena_verified.538 -webarena_verified.540,False,shopping_admin,backend_state,540,test,webarena_verified.539 -webarena_verified.541,False,shopping_admin,backend_state,541,test,webarena_verified.540 -webarena_verified.542,False,shopping_admin,backend_state,542,train,webarena_verified.541 -webarena_verified.543,False,shopping_admin,backend_state,543,test,webarena_verified.542 -webarena_verified.544,False,shopping_admin,backend_state,544,test,webarena_verified.543 -webarena_verified.545,False,shopping_admin,backend_state,545,test,webarena_verified.544 -webarena_verified.546,False,shopping_admin,retrieve_value,546,train,webarena_verified.545 -webarena_verified.547,False,shopping_admin,backend_state,547,train,webarena_verified.546 -webarena_verified.548,False,shopping_admin,backend_state,548,train,webarena_verified.547 -webarena_verified.549,False,shopping_admin,backend_state,549,test,webarena_verified.548 -webarena_verified.550,False,shopping_admin,backend_state,550,train,webarena_verified.549 
-webarena_verified.551,False,shopping_admin,backend_state,551,test,webarena_verified.550 -webarena_verified.552,False,gitlab reddit,program_html,552,test,webarena_verified.537 webarena_verified.410 -webarena_verified.553,False,gitlab reddit,program_html,553,test,webarena_verified.552 -webarena_verified.554,False,gitlab reddit,program_html,554,test,webarena_verified.553 -webarena_verified.555,False,gitlab reddit,program_html,555,test,webarena_verified.554 -webarena_verified.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.555 -webarena_verified.557,False,gitlab wikipedia,program_html,557,test,webarena_verified.556 -webarena_verified.558,False,gitlab wikipedia,program_html,558,train,webarena_verified.557 -webarena_verified.559,False,gitlab wikipedia,program_html,559,train,webarena_verified.558 -webarena_verified.560,False,gitlab wikipedia,program_html,560,test,webarena_verified.559 -webarena_verified.561,False,gitlab wikipedia,program_html,561,test,webarena_verified.560 -webarena_verified.562,False,gitlab reddit,program_html,562,train,webarena_verified.561 webarena_verified.555 -webarena_verified.563,False,gitlab reddit,program_html,563,train,webarena_verified.562 -webarena_verified.564,False,gitlab reddit,program_html,564,train,webarena_verified.563 -webarena_verified.565,False,gitlab reddit,program_html,565,test,webarena_verified.564 -webarena_verified.566,False,gitlab reddit,program_html,566,test,webarena_verified.565 -webarena_verified.567,False,gitlab,backend_state,567,test,webarena_verified.566 -webarena_verified.568,False,gitlab,backend_state,568,train,webarena_verified.567 -webarena_verified.569,False,gitlab,backend_state,569,train,webarena_verified.568 -webarena_verified.570,False,gitlab,backend_state,570,test,webarena_verified.569 -webarena_verified.571,False,shopping,backend_state,571,test,webarena_verified.532 -webarena_verified.572,False,shopping,backend_state,572,train,webarena_verified.571 
-webarena_verified.573,False,shopping,backend_state,573,train,webarena_verified.572 -webarena_verified.574,False,shopping,backend_state,574,test,webarena_verified.573 -webarena_verified.575,False,shopping,backend_state,575,train,webarena_verified.574 -webarena_verified.576,False,gitlab,backend_state,576,test,webarena_verified.570 -webarena_verified.577,False,gitlab,backend_state,577,train,webarena_verified.576 -webarena_verified.578,False,gitlab,backend_state,578,test,webarena_verified.577 -webarena_verified.579,False,gitlab,backend_state,579,train,webarena_verified.578 -webarena_verified.580,False,reddit,backend_state,580,train,webarena_verified.566 -webarena_verified.581,False,reddit,backend_state,581,train,webarena_verified.580 -webarena_verified.582,False,reddit,backend_state,582,test,webarena_verified.581 -webarena_verified.583,False,reddit,backend_state,583,test,webarena_verified.582 -webarena_verified.584,False,reddit,backend_state,584,train,webarena_verified.583 -webarena_verified.585,False,shopping,backend_state,585,train,webarena_verified.575 -webarena_verified.586,False,shopping,backend_state,586,test,webarena_verified.585 -webarena_verified.587,False,shopping,backend_state,587,train,webarena_verified.586 -webarena_verified.588,False,shopping,backend_state,588,train,webarena_verified.587 -webarena_verified.589,False,shopping,backend_state,589,test,webarena_verified.588 -webarena_verified.590,False,gitlab,backend_state,590,train,webarena_verified.579 -webarena_verified.591,False,gitlab,backend_state,591,test,webarena_verified.590 -webarena_verified.592,False,gitlab,backend_state,592,test,webarena_verified.591 -webarena_verified.593,False,gitlab,backend_state,593,test,webarena_verified.592 -webarena_verified.594,False,gitlab,backend_state,594,train,webarena_verified.593 -webarena_verified.595,False,reddit,backend_state,595,train,webarena_verified.584 -webarena_verified.596,False,reddit,backend_state,596,test,webarena_verified.595 
-webarena_verified.597,False,reddit,backend_state,597,train,webarena_verified.596 -webarena_verified.598,False,reddit,backend_state,598,train,webarena_verified.597 -webarena_verified.599,False,reddit,backend_state,599,test,webarena_verified.598 -webarena_verified.600,False,reddit,backend_state,600,test,webarena_verified.599 -webarena_verified.601,False,reddit,backend_state,601,train,webarena_verified.600 -webarena_verified.602,False,reddit,backend_state,602,train,webarena_verified.601 -webarena_verified.603,False,reddit,backend_state,603,train,webarena_verified.602 -webarena_verified.604,False,reddit,backend_state,604,test,webarena_verified.603 -webarena_verified.605,False,reddit,backend_state,605,train,webarena_verified.604 -webarena_verified.606,False,reddit,backend_state,606,train,webarena_verified.605 -webarena_verified.607,False,reddit,backend_state,607,test,webarena_verified.606 -webarena_verified.608,False,reddit,backend_state,608,test,webarena_verified.607 -webarena_verified.609,False,reddit,backend_state,609,train,webarena_verified.608 -webarena_verified.610,False,reddit,backend_state,610,train,webarena_verified.609 -webarena_verified.611,False,reddit,backend_state,611,train,webarena_verified.610 -webarena_verified.612,False,reddit,backend_state,612,test,webarena_verified.611 -webarena_verified.613,False,reddit,backend_state,613,train,webarena_verified.612 -webarena_verified.614,False,reddit,backend_state,614,test,webarena_verified.613 -webarena_verified.615,False,reddit,ui_state,615,test,webarena_verified.614 -webarena_verified.616,False,reddit,ui_state,616,test,webarena_verified.615 -webarena_verified.617,False,reddit,ui_state,617,train,webarena_verified.616 -webarena_verified.618,False,reddit,ui_state,618,train,webarena_verified.617 -webarena_verified.619,False,reddit,ui_state,619,train,webarena_verified.618 -webarena_verified.620,False,reddit,backend_state,620,train,webarena_verified.619 
-webarena_verified.621,False,reddit,backend_state,621,train,webarena_verified.620 -webarena_verified.622,False,reddit,backend_state,622,train,webarena_verified.621 -webarena_verified.623,False,reddit,backend_state,623,test,webarena_verified.622 -webarena_verified.624,False,reddit,backend_state,624,test,webarena_verified.623 -webarena_verified.625,False,reddit,backend_state,625,train,webarena_verified.624 -webarena_verified.626,False,reddit,backend_state,626,train,webarena_verified.625 -webarena_verified.627,False,reddit,backend_state,627,train,webarena_verified.626 -webarena_verified.628,False,reddit,backend_state,628,test,webarena_verified.627 -webarena_verified.629,False,reddit,backend_state,629,test,webarena_verified.628 -webarena_verified.630,False,reddit,backend_state,630,test,webarena_verified.629 -webarena_verified.631,False,reddit,backend_state,631,train,webarena_verified.630 -webarena_verified.632,False,reddit,backend_state,632,train,webarena_verified.631 -webarena_verified.633,False,reddit,backend_state,633,test,webarena_verified.632 -webarena_verified.634,False,reddit,backend_state,634,train,webarena_verified.633 -webarena_verified.635,False,reddit,backend_state,635,train,webarena_verified.634 -webarena_verified.636,False,reddit,backend_state,636,train,webarena_verified.635 -webarena_verified.637,False,reddit,backend_state,637,train,webarena_verified.636 -webarena_verified.638,False,reddit,ui_state,638,test,webarena_verified.637 -webarena_verified.639,False,reddit,backend_state,639,test,webarena_verified.638 -webarena_verified.640,False,reddit,backend_state,640,train,webarena_verified.639 -webarena_verified.641,False,reddit,backend_state,641,test,webarena_verified.640 -webarena_verified.642,False,reddit,backend_state,642,test,webarena_verified.641 -webarena_verified.643,False,reddit,backend_state,643,train,webarena_verified.642 -webarena_verified.644,False,reddit,backend_state,644,train,webarena_verified.643 
-webarena_verified.645,False,reddit,backend_state,645,train,webarena_verified.644 -webarena_verified.646,False,reddit,backend_state,646,train,webarena_verified.645 -webarena_verified.647,False,reddit,backend_state,647,train,webarena_verified.646 -webarena_verified.648,False,reddit,backend_state,648,test,webarena_verified.647 -webarena_verified.649,False,reddit,backend_state,649,test,webarena_verified.648 -webarena_verified.650,False,reddit,backend_state,650,train,webarena_verified.649 -webarena_verified.651,False,reddit,backend_state,651,train,webarena_verified.650 -webarena_verified.652,False,reddit,backend_state,652,train,webarena_verified.651 -webarena_verified.653,False,shopping,ui_state,653,train,webarena_verified.589 -webarena_verified.654,False,shopping,ui_state,654,test,webarena_verified.653 -webarena_verified.655,False,shopping,ui_state,655,test,webarena_verified.654 -webarena_verified.656,False,shopping,ui_state,656,train,webarena_verified.655 -webarena_verified.657,False,shopping,ui_state,657,train,webarena_verified.656 -webarena_verified.658,False,gitlab,backend_state,658,train,webarena_verified.594 -webarena_verified.659,False,gitlab,backend_state,659,test,webarena_verified.658 -webarena_verified.660,False,gitlab,backend_state,660,test,webarena_verified.659 -webarena_verified.661,False,gitlab,backend_state,661,test,webarena_verified.660 -webarena_verified.662,False,gitlab,backend_state,662,train,webarena_verified.661 -webarena_verified.663,False,gitlab,backend_state,663,train,webarena_verified.662 -webarena_verified.664,False,gitlab,backend_state,664,test,webarena_verified.663 -webarena_verified.665,False,gitlab,backend_state,665,train,webarena_verified.664 -webarena_verified.666,False,gitlab,retrieve_value,666,test,webarena_verified.665 -webarena_verified.667,False,gitlab,backend_state,667,test,webarena_verified.666 -webarena_verified.668,False,gitlab,retrieve_value,668,test,webarena_verified.667 
-webarena_verified.669,False,gitlab,backend_state,669,test,webarena_verified.668 -webarena_verified.670,False,gitlab,backend_state,670,train,webarena_verified.669 -webarena_verified.671,False,shopping reddit,ui_state,671,train,webarena_verified.657 webarena_verified.652 -webarena_verified.672,False,shopping reddit,ui_state,672,train,webarena_verified.671 -webarena_verified.673,False,shopping reddit,ui_state,673,test,webarena_verified.672 -webarena_verified.674,False,shopping reddit,ui_state,674,test,webarena_verified.673 -webarena_verified.675,False,shopping reddit,ui_state,675,train,webarena_verified.674 -webarena_verified.676,False,shopping_admin,ui_state,676,test,webarena_verified.551 -webarena_verified.677,False,shopping_admin,ui_state,677,test,webarena_verified.676 -webarena_verified.678,False,shopping_admin,ui_state,678,train,webarena_verified.677 -webarena_verified.679,False,shopping_admin,ui_state,679,train,webarena_verified.678 -webarena_verified.680,False,shopping_admin,ui_state,680,train,webarena_verified.679 -webarena_verified.681,False,reddit gitlab,ui_state,681,train,webarena_verified.675 webarena_verified.670 -webarena_verified.682,False,reddit gitlab,ui_state,682,train,webarena_verified.681 -webarena_verified.683,False,reddit gitlab,ui_state,683,test,webarena_verified.682 -webarena_verified.684,False,reddit gitlab,ui_state,684,train,webarena_verified.683 -webarena_verified.685,False,reddit gitlab,ui_state,685,train,webarena_verified.684 -webarena_verified.686,False,reddit gitlab,ui_state,686,train,webarena_verified.685 -webarena_verified.687,False,reddit gitlab,ui_state,687,test,webarena_verified.686 -webarena_verified.688,False,reddit gitlab,ui_state,688,test,webarena_verified.687 -webarena_verified.689,False,shopping,ui_state,689,test,webarena_verified.675 -webarena_verified.690,False,shopping,ui_state,690,test,webarena_verified.689 -webarena_verified.691,False,shopping,ui_state,691,train,webarena_verified.690 
-webarena_verified.692,False,shopping,ui_state,692,train,webarena_verified.691 -webarena_verified.693,False,shopping,ui_state,693,train,webarena_verified.692 -webarena_verified.694,False,shopping_admin,backend_state,694,train,webarena_verified.680 -webarena_verified.695,False,shopping_admin,backend_state,695,train,webarena_verified.694 -webarena_verified.696,False,shopping_admin,backend_state,696,test,webarena_verified.695 -webarena_verified.697,False,shopping_admin,backend_state,697,train,webarena_verified.696 -webarena_verified.698,False,shopping_admin,backend_state,698,test,webarena_verified.697 -webarena_verified.699,False,shopping_admin,backend_state,699,train,webarena_verified.698 -webarena_verified.700,False,shopping_admin,backend_state,700,test,webarena_verified.699 -webarena_verified.701,False,shopping_admin,backend_state,701,test,webarena_verified.700 -webarena_verified.702,False,shopping_admin,backend_state,702,train,webarena_verified.701 -webarena_verified.703,False,shopping_admin,backend_state,703,train,webarena_verified.702 -webarena_verified.704,False,shopping_admin,ui_state,704,test,webarena_verified.703 -webarena_verified.705,False,shopping_admin,ui_state,705,test,webarena_verified.704 -webarena_verified.706,False,shopping_admin,ui_state,706,train,webarena_verified.705 -webarena_verified.707,False,shopping_admin,ui_state,707,train,webarena_verified.706 -webarena_verified.708,False,shopping_admin,ui_state,708,train,webarena_verified.707 -webarena_verified.709,False,shopping_admin,ui_state,709,test,webarena_verified.708 -webarena_verified.710,False,shopping_admin,ui_state,710,test,webarena_verified.709 -webarena_verified.711,False,shopping_admin,ui_state,711,train,webarena_verified.710 -webarena_verified.712,False,shopping_admin,ui_state,712,train,webarena_verified.711 -webarena_verified.713,False,shopping_admin,ui_state,713,train,webarena_verified.712 -webarena_verified.714,False,reddit,backend_state,714,train,webarena_verified.688 
-webarena_verified.715,False,reddit,backend_state,715,train,webarena_verified.714 -webarena_verified.716,False,reddit,backend_state,716,train,webarena_verified.715 -webarena_verified.717,False,reddit,backend_state,717,test,webarena_verified.716 -webarena_verified.718,False,reddit,backend_state,718,test,webarena_verified.717 -webarena_verified.719,False,reddit,backend_state,719,train,webarena_verified.718 -webarena_verified.720,False,reddit,backend_state,720,test,webarena_verified.719 -webarena_verified.721,False,reddit,backend_state,721,train,webarena_verified.720 -webarena_verified.722,False,reddit,backend_state,722,train,webarena_verified.721 -webarena_verified.723,False,reddit,backend_state,723,test,webarena_verified.722 -webarena_verified.724,False,reddit,backend_state,724,test,webarena_verified.723 -webarena_verified.725,False,reddit,backend_state,725,test,webarena_verified.724 -webarena_verified.726,False,reddit,backend_state,726,test,webarena_verified.725 -webarena_verified.727,False,reddit,backend_state,727,train,webarena_verified.726 -webarena_verified.728,False,reddit,backend_state,728,train,webarena_verified.727 -webarena_verified.729,False,reddit,backend_state,729,train,webarena_verified.728 -webarena_verified.730,False,reddit,backend_state,730,test,webarena_verified.729 -webarena_verified.731,False,reddit,backend_state,731,test,webarena_verified.730 -webarena_verified.732,False,reddit,backend_state,732,train,webarena_verified.731 -webarena_verified.733,False,reddit,backend_state,733,train,webarena_verified.732 -webarena_verified.734,False,reddit,program_html,734,train,webarena_verified.733 -webarena_verified.735,False,reddit,program_html,735,test,webarena_verified.734 -webarena_verified.736,False,gitlab,backend_state,736,train,webarena_verified.688 -webarena_verified.737,False,wikipedia map,program_html,737,train,webarena_verified.430 -webarena_verified.738,False,wikipedia map,program_html,738,test,webarena_verified.737 
-webarena_verified.739,False,wikipedia map,program_html,739,train,webarena_verified.738 -webarena_verified.740,False,wikipedia map,program_html,740,test,webarena_verified.739 -webarena_verified.741,False,wikipedia map,program_html,741,train,webarena_verified.740 -webarena_verified.742,False,gitlab,backend_state,742,test,webarena_verified.736 -webarena_verified.743,False,gitlab,backend_state,743,test,webarena_verified.742 -webarena_verified.744,False,gitlab,backend_state,744,test,webarena_verified.743 -webarena_verified.745,False,gitlab,backend_state,745,test,webarena_verified.744 -webarena_verified.746,False,gitlab,backend_state,746,train,webarena_verified.745 -webarena_verified.747,False,gitlab,backend_state,747,train,webarena_verified.746 -webarena_verified.748,False,gitlab,backend_state,748,train,webarena_verified.747 -webarena_verified.749,False,gitlab,backend_state,749,test,webarena_verified.748 -webarena_verified.750,False,gitlab,backend_state,750,test,webarena_verified.749 -webarena_verified.751,False,gitlab,backend_state,751,train,webarena_verified.750 -webarena_verified.752,False,gitlab,backend_state,752,train,webarena_verified.751 -webarena_verified.753,False,gitlab,backend_state,753,test,webarena_verified.752 -webarena_verified.754,False,gitlab,backend_state,754,train,webarena_verified.753 -webarena_verified.755,False,gitlab,backend_state,755,test,webarena_verified.754 -webarena_verified.756,False,gitlab,backend_state,756,train,webarena_verified.755 -webarena_verified.757,False,map,program_html,757,test,webarena_verified.741 -webarena_verified.758,False,map,program_html,758,test,webarena_verified.757 -webarena_verified.759,False,map shopping_admin,program_html,759,test,webarena_verified.758 webarena_verified.713 -webarena_verified.760,False,map shopping_admin,program_html,760,test,webarena_verified.759 -webarena_verified.761,False,map,program_html,761,train,webarena_verified.760 
-webarena_verified.762,False,map,program_html,762,train,webarena_verified.761 -webarena_verified.763,False,map,program_html,763,test,webarena_verified.762 -webarena_verified.764,False,map,program_html,764,test,webarena_verified.763 -webarena_verified.765,False,map,program_html,765,train,webarena_verified.764 -webarena_verified.766,False,map,program_html,766,train,webarena_verified.765 -webarena_verified.767,False,map,program_html,767,train,webarena_verified.766 -webarena_verified.768,False,shopping_admin,backend_state,768,test,webarena_verified.760 -webarena_verified.769,False,shopping_admin,backend_state,769,test,webarena_verified.768 -webarena_verified.770,False,shopping_admin,backend_state,770,train,webarena_verified.769 -webarena_verified.771,False,shopping_admin,backend_state,771,test,webarena_verified.770 -webarena_verified.772,False,shopping_admin,backend_state,772,test,webarena_verified.771 -webarena_verified.773,False,shopping_admin,backend_state,773,train,webarena_verified.772 -webarena_verified.774,False,shopping_admin,backend_state,774,train,webarena_verified.773 -webarena_verified.775,False,shopping_admin,backend_state,775,train,webarena_verified.774 -webarena_verified.776,False,shopping_admin,backend_state,776,test,webarena_verified.775 -webarena_verified.777,False,shopping_admin,backend_state,777,train,webarena_verified.776 -webarena_verified.778,False,shopping_admin,backend_state,778,test,webarena_verified.777 -webarena_verified.779,False,shopping_admin,backend_state,779,train,webarena_verified.778 -webarena_verified.780,False,shopping_admin,backend_state,780,test,webarena_verified.779 -webarena_verified.781,False,shopping_admin,backend_state,781,train,webarena_verified.780 -webarena_verified.782,False,shopping_admin,backend_state,782,test,webarena_verified.781 -webarena_verified.783,False,gitlab,retrieve_value,783,train,webarena_verified.756 -webarena_verified.784,False,gitlab,retrieve_value,784,test,webarena_verified.783 
-webarena_verified.785,False,gitlab,retrieve_value,785,test,webarena_verified.784 -webarena_verified.786,False,gitlab,retrieve_value,786,test,webarena_verified.785 -webarena_verified.787,False,gitlab,retrieve_value,787,test,webarena_verified.786 -webarena_verified.788,False,gitlab,retrieve_value,788,test,webarena_verified.787 -webarena_verified.789,False,gitlab,retrieve_value,789,test,webarena_verified.788 -webarena_verified.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.782 -webarena_verified.791,False,gitlab reddit,string_match,791,train,webarena_verified.789 webarena_verified.735 -webarena_verified.792,False,shopping,retrieve_value,792,test,webarena_verified.693 -webarena_verified.793,False,shopping,retrieve_value,793,train,webarena_verified.792 -webarena_verified.794,False,shopping,retrieve_value,794,test,webarena_verified.793 -webarena_verified.795,False,shopping,retrieve_value,795,train,webarena_verified.794 -webarena_verified.796,False,shopping,retrieve_value,796,train,webarena_verified.795 -webarena_verified.797,False,shopping,retrieve_value,797,test,webarena_verified.796 -webarena_verified.798,False,shopping,retrieve_value,798,train,webarena_verified.797 -webarena_verified.799,False,gitlab,backend_state,799,train,webarena_verified.791 -webarena_verified.800,False,gitlab,backend_state,800,test,webarena_verified.799 -webarena_verified.801,False,gitlab,backend_state,801,train,webarena_verified.800 -webarena_verified.802,False,gitlab,backend_state,802,train,webarena_verified.801 -webarena_verified.803,False,gitlab,backend_state,803,test,webarena_verified.802 -webarena_verified.804,False,gitlab,backend_state,804,train,webarena_verified.803 -webarena_verified.805,False,gitlab,backend_state,805,test,webarena_verified.804 -webarena_verified.806,False,gitlab,backend_state,806,test,webarena_verified.805 -webarena_verified.807,False,gitlab,backend_state,807,train,webarena_verified.806 
-webarena_verified.808,False,gitlab,backend_state,808,train,webarena_verified.807 -webarena_verified.809,False,gitlab,backend_state,809,train,webarena_verified.808 -webarena_verified.810,False,gitlab,backend_state,810,test,webarena_verified.809 -webarena_verified.811,False,gitlab,backend_state,811,test,webarena_verified.810 +webarena_verified.279.0,False,shopping_admin,retrieve_value,0,train, +webarena_verified.279.1,False,shopping_admin,retrieve_value,1,test,webarena_verified.279.0 +webarena_verified.279.2,False,shopping_admin,retrieve_value,2,train,webarena_verified.279.1 +webarena_verified.279.3,False,shopping_admin,retrieve_value,3,test,webarena_verified.279.2 +webarena_verified.279.4,False,shopping_admin,retrieve_value,4,train,webarena_verified.279.3 +webarena_verified.279.5,False,shopping_admin,retrieve_value,5,train,webarena_verified.279.4 +webarena_verified.279.6,False,shopping_admin,retrieve_value,6,test,webarena_verified.279.5 +webarena_verified.79.7,False,map,retrieve_value,7,train, +webarena_verified.79.8,False,map,string_match,8,test,webarena_verified.79.7 +webarena_verified.79.9,False,map,retrieve_value,9,test,webarena_verified.79.8 +webarena_verified.79.10,False,map,retrieve_value,10,test,webarena_verified.79.9 +webarena_verified.288.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.288.6 +webarena_verified.288.12,False,shopping_admin,retrieve_value,12,train,webarena_verified.288.11 +webarena_verified.288.13,False,shopping_admin,retrieve_value,13,train,webarena_verified.288.12 +webarena_verified.288.14,False,shopping_admin,retrieve_value,14,train,webarena_verified.288.13 +webarena_verified.288.15,False,shopping_admin,retrieve_value,15,test,webarena_verified.288.14 +webarena_verified.73.16,False,map,string_match,16,test,webarena_verified.73.10 +webarena_verified.73.17,False,map,string_match,17,train,webarena_verified.73.16 +webarena_verified.73.18,False,map,string_match,18,test,webarena_verified.73.17 
+webarena_verified.73.19,False,map,string_match,19,train,webarena_verified.73.18 +webarena_verified.73.20,False,map,string_match,20,test,webarena_verified.73.19 +webarena_verified.222.21,False,shopping,retrieve_value,21,test, +webarena_verified.222.22,False,shopping,retrieve_value,22,test,webarena_verified.222.21 +webarena_verified.222.23,False,shopping,retrieve_value,23,test,webarena_verified.222.22 +webarena_verified.222.24,False,shopping,retrieve_value,24,test,webarena_verified.222.23 +webarena_verified.222.25,False,shopping,retrieve_value,25,test,webarena_verified.222.24 +webarena_verified.222.26,False,shopping,retrieve_value,26,test,webarena_verified.222.25 +webarena_verified.33.27,False,reddit,retrieve_value,27,test, +webarena_verified.33.28,False,reddit,retrieve_value,28,train,webarena_verified.33.27 +webarena_verified.33.29,False,reddit,retrieve_value,29,train,webarena_verified.33.28 +webarena_verified.33.30,False,reddit,retrieve_value,30,test,webarena_verified.33.29 +webarena_verified.33.31,False,reddit,retrieve_value,31,train,webarena_verified.33.30 +webarena_verified.78.32,False,map,retrieve_value,32,test,webarena_verified.78.20 +webarena_verified.78.33,False,map,retrieve_value,33,test,webarena_verified.78.32 +webarena_verified.78.34,False,map,retrieve_value,34,train,webarena_verified.78.33 +webarena_verified.78.35,False,map,retrieve_value,35,test,webarena_verified.78.34 +webarena_verified.77.36,False,map,retrieve_value,36,test,webarena_verified.77.35 +webarena_verified.77.37,False,map,retrieve_value,37,train,webarena_verified.77.36 +webarena_verified.77.38,False,map,retrieve_value,38,train,webarena_verified.77.37 +webarena_verified.77.39,False,map,retrieve_value,39,train,webarena_verified.77.38 +webarena_verified.77.40,False,map,retrieve_value,40,test,webarena_verified.77.39 +webarena_verified.285.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.285.15 
+webarena_verified.285.42,False,shopping_admin,retrieve_value,42,train,webarena_verified.285.41 +webarena_verified.285.43,False,shopping_admin,retrieve_value,43,test,webarena_verified.285.42 +webarena_verified.303.44,False,gitlab,ui_state,44,train, +webarena_verified.300.45,False,gitlab,ui_state,45,test,webarena_verified.300.44 +webarena_verified.300.46,False,gitlab,ui_state,46,test,webarena_verified.300.45 +webarena_verified.197.47,False,shopping,retrieve_value,47,train,webarena_verified.197.26 +webarena_verified.197.48,False,shopping,retrieve_value,48,test,webarena_verified.197.47 +webarena_verified.197.49,False,shopping,retrieve_value,49,train,webarena_verified.197.48 +webarena_verified.197.50,False,shopping,retrieve_value,50,train,webarena_verified.197.49 +webarena_verified.197.51,False,shopping,retrieve_value,51,test,webarena_verified.197.50 +webarena_verified.68.52,False,map,string_match,52,test,webarena_verified.68.40 +webarena_verified.68.53,False,map,string_match,53,train,webarena_verified.68.52 +webarena_verified.68.54,False,map,string_match,54,test,webarena_verified.68.53 +webarena_verified.68.55,False,map,string_match,55,train,webarena_verified.68.54 +webarena_verified.68.56,False,map,string_match,56,train,webarena_verified.68.55 +webarena_verified.69.57,False,map,retrieve_value,57,train,webarena_verified.69.56 +webarena_verified.69.58,False,map,retrieve_value,58,train,webarena_verified.69.57 +webarena_verified.69.59,False,map,retrieve_value,59,test,webarena_verified.69.58 +webarena_verified.69.60,False,map,retrieve_value,60,test,webarena_verified.69.59 +webarena_verified.69.61,False,map,retrieve_value,61,train,webarena_verified.69.60 +webarena_verified.276.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.276.43 +webarena_verified.276.63,False,shopping_admin,retrieve_value,63,test,webarena_verified.276.62 +webarena_verified.276.64,False,shopping_admin,retrieve_value,64,test,webarena_verified.276.63 
+webarena_verified.276.65,False,shopping_admin,retrieve_value,65,train,webarena_verified.276.64 +webarena_verified.17.66,False,reddit,retrieve_value,66,test,webarena_verified.17.31 +webarena_verified.17.67,False,reddit,retrieve_value,67,test,webarena_verified.17.66 +webarena_verified.17.68,False,reddit,retrieve_value,68,train,webarena_verified.17.67 +webarena_verified.17.69,False,reddit,retrieve_value,69,test,webarena_verified.17.68 +webarena_verified.70.70,False,map,retrieve_value,70,train,webarena_verified.70.61 +webarena_verified.70.71,False,map,retrieve_value,71,test,webarena_verified.70.70 +webarena_verified.70.72,False,map,retrieve_value,72,train,webarena_verified.70.71 +webarena_verified.70.73,False,map,retrieve_value,73,test,webarena_verified.70.72 +webarena_verified.65.74,False,map,string_match,74,train,webarena_verified.65.73 +webarena_verified.65.75,False,map,string_match,75,train,webarena_verified.65.74 +webarena_verified.65.76,False,map,retrieve_value,76,train,webarena_verified.65.75 +webarena_verified.277.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.277.65 +webarena_verified.277.78,False,shopping_admin,retrieve_value,78,train,webarena_verified.277.77 +webarena_verified.277.79,False,shopping_admin,retrieve_value,79,test,webarena_verified.277.78 +webarena_verified.72.80,False,map,string_match,80,test,webarena_verified.72.76 +webarena_verified.72.81,False,map,string_match,81,test,webarena_verified.72.80 +webarena_verified.72.82,False,map,string_match,82,train,webarena_verified.72.81 +webarena_verified.72.83,False,map,string_match,83,train,webarena_verified.72.82 +webarena_verified.64.84,False,map,string_match,84,train,webarena_verified.64.83 +webarena_verified.64.85,False,map,string_match,85,test,webarena_verified.64.84 +webarena_verified.64.86,False,map,string_match,86,test,webarena_verified.64.85 +webarena_verified.64.87,False,map,string_match,87,train,webarena_verified.64.86 
+webarena_verified.64.88,False,map,string_match,88,train,webarena_verified.64.87 +webarena_verified.67.89,False,map,retrieve_value,89,test,webarena_verified.67.88 +webarena_verified.67.90,False,map,retrieve_value,90,test,webarena_verified.67.89 +webarena_verified.67.91,False,map,retrieve_value,91,train,webarena_verified.67.90 +webarena_verified.67.92,False,map,retrieve_value,92,train,webarena_verified.67.91 +webarena_verified.67.93,False,map,retrieve_value,93,train,webarena_verified.67.92 +webarena_verified.274.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.274.79 +webarena_verified.274.95,False,shopping_admin,retrieve_value,95,train,webarena_verified.274.94 +webarena_verified.193.96,False,shopping,retrieve_value,96,test,webarena_verified.193.51 +webarena_verified.120.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.120.93 +webarena_verified.66.98,False,map,retrieve_value,98,test,webarena_verified.66.97 +webarena_verified.66.99,False,map,retrieve_value,99,train,webarena_verified.66.98 +webarena_verified.66.100,False,map,retrieve_value,100,test,webarena_verified.66.99 +webarena_verified.66.101,False,map,string_match,101,train,webarena_verified.66.100 +webarena_verified.349.102,False,gitlab,ui_state,102,train,webarena_verified.349.46 +webarena_verified.349.103,False,gitlab,ui_state,103,train,webarena_verified.349.102 +webarena_verified.349.104,False,gitlab,ui_state,104,test,webarena_verified.349.103 +webarena_verified.349.105,False,gitlab,ui_state,105,train,webarena_verified.349.104 +webarena_verified.349.106,False,gitlab,ui_state,106,test,webarena_verified.349.105 +webarena_verified.270.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.270.95 +webarena_verified.270.108,False,shopping_admin,retrieve_value,108,train,webarena_verified.270.107 +webarena_verified.270.109,False,shopping_admin,retrieve_value,109,test,webarena_verified.270.108 
+webarena_verified.270.110,False,shopping_admin,retrieve_value,110,train,webarena_verified.270.109 +webarena_verified.270.111,False,shopping_admin,retrieve_value,111,train,webarena_verified.270.110 +webarena_verified.245.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.245.111 +webarena_verified.245.113,False,shopping_admin,retrieve_value,113,test,webarena_verified.245.112 +webarena_verified.245.114,False,shopping_admin,retrieve_value,114,train,webarena_verified.245.113 +webarena_verified.245.115,False,shopping_admin,retrieve_value,115,test,webarena_verified.245.114 +webarena_verified.245.116,False,shopping_admin,retrieve_value,116,test,webarena_verified.245.115 +webarena_verified.161.117,False,shopping,retrieve_value,117,test,webarena_verified.161.96 +webarena_verified.151.118,False,shopping,program_html,118,train,webarena_verified.151.117 +webarena_verified.250.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.250.116 +webarena_verified.250.120,False,shopping_admin,retrieve_value,120,train,webarena_verified.250.119 +webarena_verified.250.121,False,shopping_admin,retrieve_value,121,train,webarena_verified.250.120 +webarena_verified.250.122,False,shopping_admin,retrieve_value,122,test,webarena_verified.250.121 +webarena_verified.250.123,False,shopping_admin,retrieve_value,123,train,webarena_verified.250.122 +webarena_verified.159.124,False,shopping,retrieve_value,124,train,webarena_verified.159.118 +webarena_verified.159.125,False,shopping,retrieve_value,125,train,webarena_verified.159.124 +webarena_verified.159.126,False,shopping,retrieve_value,126,test,webarena_verified.159.125 +webarena_verified.1001.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.1001.123 +webarena_verified.1002.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.1002.127 +webarena_verified.1002.129,False,shopping_admin,retrieve_value,129,train,webarena_verified.1002.128 
+webarena_verified.1002.130,False,shopping_admin,retrieve_value,130,train,webarena_verified.1002.129 +webarena_verified.1002.131,False,shopping_admin,retrieve_value,131,test,webarena_verified.1002.130 +webarena_verified.322.132,False,gitlab,retrieve_value,132,train,webarena_verified.322.106 +webarena_verified.322.133,False,gitlab,retrieve_value,133,test,webarena_verified.322.132 +webarena_verified.322.134,False,gitlab,retrieve_value,134,test,webarena_verified.322.133 +webarena_verified.322.135,False,gitlab,retrieve_value,135,train,webarena_verified.322.134 +webarena_verified.322.136,False,gitlab,retrieve_value,136,train,webarena_verified.322.135 +webarena_verified.51.137,False,map,string_match,137,test,webarena_verified.51.101 +webarena_verified.51.138,False,map,string_match,138,test,webarena_verified.51.137 +webarena_verified.51.139,False,map,string_match,139,test,webarena_verified.51.138 +webarena_verified.51.140,False,map,string_match,140,train,webarena_verified.51.139 +webarena_verified.162.141,False,shopping,retrieve_value,141,train,webarena_verified.162.126 +webarena_verified.162.142,False,shopping,retrieve_value,142,train,webarena_verified.162.141 +webarena_verified.162.143,False,shopping,retrieve_value,143,test,webarena_verified.162.142 +webarena_verified.162.144,False,shopping,retrieve_value,144,test,webarena_verified.162.143 +webarena_verified.162.145,False,shopping,retrieve_value,145,train,webarena_verified.162.144 +webarena_verified.155.146,False,shopping,retrieve_value,146,test,webarena_verified.155.145 +webarena_verified.155.147,False,shopping,retrieve_value,147,train,webarena_verified.155.146 +webarena_verified.155.148,False,shopping,retrieve_value,148,train,webarena_verified.155.147 +webarena_verified.155.149,False,shopping,retrieve_value,149,test,webarena_verified.155.148 +webarena_verified.155.150,False,shopping,retrieve_value,150,train,webarena_verified.155.149 +webarena_verified.36.151,False,map,string_match,151,train,webarena_verified.36.140 
+webarena_verified.36.152,False,map,string_match,152,train,webarena_verified.36.151 +webarena_verified.36.153,False,map,string_match,153,test,webarena_verified.36.152 +webarena_verified.36.154,False,map,string_match,154,train,webarena_verified.36.153 +webarena_verified.36.155,False,map,string_match,155,test,webarena_verified.36.154 +webarena_verified.290.156,False,gitlab,ui_state,156,test,webarena_verified.290.136 +webarena_verified.255.157,False,shopping_admin,ui_state,157,train,webarena_verified.255.131 +webarena_verified.171.158,False,shopping,ui_state,158,test,webarena_verified.171.150 +webarena_verified.171.159,False,shopping,ui_state,159,train,webarena_verified.171.158 +webarena_verified.171.160,False,shopping,ui_state,160,train,webarena_verified.171.159 +webarena_verified.171.161,False,shopping,ui_state,161,train,webarena_verified.171.160 +webarena_verified.171.162,False,shopping,ui_state,162,test,webarena_verified.171.161 +webarena_verified.136.163,False,shopping,retrieve_value,163,test,webarena_verified.136.162 +webarena_verified.136.164,False,shopping,retrieve_value,164,test,webarena_verified.136.163 +webarena_verified.136.165,False,shopping,retrieve_value,165,test,webarena_verified.136.164 +webarena_verified.136.166,False,shopping,retrieve_value,166,test,webarena_verified.136.165 +webarena_verified.136.167,False,shopping,retrieve_value,167,test,webarena_verified.136.166 +webarena_verified.289.168,False,gitlab,retrieve_value,168,test,webarena_verified.289.156 +webarena_verified.289.169,False,gitlab,retrieve_value,169,train,webarena_verified.289.168 +webarena_verified.289.170,False,gitlab,retrieve_value,170,train,webarena_verified.289.169 +webarena_verified.289.171,False,gitlab,retrieve_value,171,test,webarena_verified.289.170 +webarena_verified.289.172,False,gitlab,retrieve_value,172,train,webarena_verified.289.171 +webarena_verified.310.173,False,gitlab,retrieve_value,173,train,webarena_verified.310.172 
+webarena_verified.310.174,False,gitlab,retrieve_value,174,test,webarena_verified.310.173 +webarena_verified.310.175,False,gitlab,retrieve_value,175,train,webarena_verified.310.174 +webarena_verified.310.176,False,gitlab,retrieve_value,176,train,webarena_verified.310.175 +webarena_verified.310.177,False,gitlab,retrieve_value,177,test,webarena_verified.310.176 +webarena_verified.500.178,False,gitlab,retrieve_value,178,test,webarena_verified.500.177 +webarena_verified.500.179,False,gitlab,retrieve_value,179,train,webarena_verified.500.178 +webarena_verified.500.180,False,gitlab,retrieve_value,180,train,webarena_verified.500.179 +webarena_verified.500.181,False,gitlab,retrieve_value,181,test,webarena_verified.500.180 +webarena_verified.500.182,False,gitlab,retrieve_value,182,train,webarena_verified.500.181 +webarena_verified.368.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.368.157 +webarena_verified.368.184,False,shopping_admin,retrieve_value,184,train,webarena_verified.368.183 +webarena_verified.368.185,False,shopping_admin,retrieve_value,185,test,webarena_verified.368.184 +webarena_verified.368.186,False,shopping_admin,retrieve_value,186,train,webarena_verified.368.185 +webarena_verified.368.187,False,shopping_admin,retrieve_value,187,test,webarena_verified.368.186 +webarena_verified.214.188,False,shopping,retrieve_value,188,test,webarena_verified.214.167 +webarena_verified.214.189,False,shopping,retrieve_value,189,train,webarena_verified.214.188 +webarena_verified.214.190,False,shopping,retrieve_value,190,train,webarena_verified.214.189 +webarena_verified.214.191,False,shopping,retrieve_value,191,train,webarena_verified.214.190 +webarena_verified.214.192,False,shopping,retrieve_value,192,test,webarena_verified.214.191 +webarena_verified.367.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.367.187 +webarena_verified.367.194,False,shopping_admin,retrieve_value,194,train,webarena_verified.367.193 
+webarena_verified.367.195,False,shopping_admin,retrieve_value,195,test,webarena_verified.367.194 +webarena_verified.367.196,False,shopping_admin,retrieve_value,196,train,webarena_verified.367.195 +webarena_verified.367.197,False,shopping_admin,retrieve_value,197,train,webarena_verified.367.196 +webarena_verified.366.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.366.197 +webarena_verified.366.199,False,shopping_admin,retrieve_value,199,train,webarena_verified.366.198 +webarena_verified.366.200,False,shopping_admin,retrieve_value,200,train,webarena_verified.366.199 +webarena_verified.366.201,False,shopping_admin,retrieve_value,201,test,webarena_verified.366.200 +webarena_verified.366.202,False,shopping_admin,retrieve_value,202,train,webarena_verified.366.201 +webarena_verified.366.203,False,shopping_admin,retrieve_value,203,test,webarena_verified.366.202 +webarena_verified.366.204,False,shopping_admin,retrieve_value,204,test,webarena_verified.366.203 +webarena_verified.320.205,False,gitlab,retrieve_value,205,train,webarena_verified.320.182 +webarena_verified.320.206,False,gitlab,retrieve_value,206,test,webarena_verified.320.205 +webarena_verified.320.207,False,gitlab,retrieve_value,207,test,webarena_verified.320.206 +webarena_verified.364.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.364.204 +webarena_verified.364.209,False,shopping_admin,retrieve_value,209,test,webarena_verified.364.208 +webarena_verified.364.210,False,shopping_admin,retrieve_value,210,train,webarena_verified.364.209 +webarena_verified.364.211,False,shopping_admin,retrieve_value,211,train,webarena_verified.364.210 +webarena_verified.364.212,False,shopping_admin,retrieve_value,212,train,webarena_verified.364.211 +webarena_verified.249.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.249.212 +webarena_verified.249.214,False,shopping_admin,retrieve_value,214,train,webarena_verified.249.213 
+webarena_verified.249.215,False,shopping_admin,retrieve_value,215,test,webarena_verified.249.214 +webarena_verified.249.216,False,shopping_admin,retrieve_value,216,train,webarena_verified.249.215 +webarena_verified.249.217,False,shopping_admin,retrieve_value,217,train,webarena_verified.249.216 +webarena_verified.41.218,False,map,string_match,218,train,webarena_verified.41.155 +webarena_verified.41.219,False,map,string_match,219,test,webarena_verified.41.218 +webarena_verified.41.220,False,map,string_match,220,train,webarena_verified.41.219 +webarena_verified.35.221,False,map,string_match,221,test,webarena_verified.35.220 +webarena_verified.35.222,False,map,string_match,222,train,webarena_verified.35.221 +webarena_verified.35.223,False,map,string_match,223,test,webarena_verified.35.222 +webarena_verified.35.224,False,map,string_match,224,test,webarena_verified.35.223 +webarena_verified.135.225,False,shopping,retrieve_value,225,test,webarena_verified.135.192 +webarena_verified.370.226,False,shopping,retrieve_value,226,train,webarena_verified.370.225 +webarena_verified.370.227,False,shopping,retrieve_value,227,train,webarena_verified.370.226 +webarena_verified.370.228,False,shopping,retrieve_value,228,test,webarena_verified.370.227 +webarena_verified.370.229,False,shopping,retrieve_value,229,test,webarena_verified.370.228 +webarena_verified.370.230,False,shopping,retrieve_value,230,train,webarena_verified.370.229 +webarena_verified.213.231,False,shopping,retrieve_value,231,test,webarena_verified.213.230 +webarena_verified.213.232,False,shopping,retrieve_value,232,train,webarena_verified.213.231 +webarena_verified.213.233,False,shopping,retrieve_value,233,test,webarena_verified.213.232 +webarena_verified.213.234,False,shopping,retrieve_value,234,train,webarena_verified.213.233 +webarena_verified.213.235,False,shopping,retrieve_value,235,train,webarena_verified.213.234 +webarena_verified.39.236,False,map,retrieve_value,236,train,webarena_verified.39.224 
+webarena_verified.39.237,False,map,retrieve_value,237,train,webarena_verified.39.236 +webarena_verified.138.238,False,shopping,ui_state,238,train,webarena_verified.138.235 +webarena_verified.138.239,False,shopping,ui_state,239,train,webarena_verified.138.238 +webarena_verified.138.240,False,shopping,ui_state,240,test,webarena_verified.138.239 +webarena_verified.138.241,False,shopping,ui_state,241,train,webarena_verified.138.240 +webarena_verified.138.242,False,shopping,ui_state,242,test,webarena_verified.138.241 +webarena_verified.244.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.244.217 +webarena_verified.244.244,False,shopping_admin,retrieve_value,244,test,webarena_verified.244.243 +webarena_verified.244.245,False,shopping_admin,retrieve_value,245,train,webarena_verified.244.244 +webarena_verified.244.246,False,shopping_admin,retrieve_value,246,test,webarena_verified.244.245 +webarena_verified.244.247,False,shopping_admin,retrieve_value,247,train,webarena_verified.244.246 +webarena_verified.46.248,False,map,retrieve_value,248,test,webarena_verified.46.237 +webarena_verified.46.249,False,map,retrieve_value,249,train,webarena_verified.46.248 +webarena_verified.46.250,False,map,retrieve_value,250,test,webarena_verified.46.249 +webarena_verified.46.251,False,map,retrieve_value,251,train,webarena_verified.46.250 +webarena_verified.46.252,False,map,retrieve_value,252,train,webarena_verified.46.251 +webarena_verified.501.253,False,map,string_match,253,test,webarena_verified.501.252 +webarena_verified.501.254,False,map,retrieve_value,254,train,webarena_verified.501.253 +webarena_verified.501.255,False,map,retrieve_value,255,test,webarena_verified.501.254 +webarena_verified.501.256,False,map,retrieve_value,256,train,webarena_verified.501.255 +webarena_verified.501.257,False,map,string_match,257,test,webarena_verified.501.256 +webarena_verified.325.258,False,gitlab,ui_state,258,train,webarena_verified.325.207 
+webarena_verified.312.259,False,gitlab,retrieve_value,259,train,webarena_verified.312.258 +webarena_verified.211.260,False,shopping,ui_state,260,test,webarena_verified.211.242 +webarena_verified.211.261,False,shopping,ui_state,261,train,webarena_verified.211.260 +webarena_verified.211.262,False,shopping,ui_state,262,train,webarena_verified.211.261 +webarena_verified.211.263,False,shopping,ui_state,263,test,webarena_verified.211.262 +webarena_verified.211.264,False,shopping,ui_state,264,train,webarena_verified.211.263 +webarena_verified.85.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.85.257 +webarena_verified.85.266,False,wikipedia map,retrieve_value,266,test,webarena_verified.85.265 +webarena_verified.85.267,False,wikipedia map,retrieve_value,267,train,webarena_verified.85.266 +webarena_verified.85.268,False,wikipedia map,retrieve_value,268,test,webarena_verified.85.267 +webarena_verified.139.269,False,shopping,ui_state,269,train,webarena_verified.139.264 +webarena_verified.139.270,False,shopping,ui_state,270,train,webarena_verified.139.269 +webarena_verified.139.271,False,shopping,ui_state,271,test,webarena_verified.139.270 +webarena_verified.139.272,False,shopping,ui_state,272,test,webarena_verified.139.271 +webarena_verified.139.273,False,shopping,ui_state,273,train,webarena_verified.139.272 +webarena_verified.212.274,False,shopping,ui_state,274,test,webarena_verified.212.273 +webarena_verified.212.275,False,shopping,ui_state,275,test,webarena_verified.212.274 +webarena_verified.212.276,False,shopping,ui_state,276,train,webarena_verified.212.275 +webarena_verified.212.277,False,shopping,ui_state,277,train,webarena_verified.212.276 +webarena_verified.212.278,False,shopping,ui_state,278,train,webarena_verified.212.277 +webarena_verified.204.279,False,shopping,retrieve_value,279,train,webarena_verified.204.278 +webarena_verified.204.280,False,shopping,retrieve_value,280,test,webarena_verified.204.279 
+webarena_verified.204.281,False,shopping,retrieve_value,281,train,webarena_verified.204.280 +webarena_verified.204.282,False,shopping,retrieve_value,282,train,webarena_verified.204.281 +webarena_verified.210.283,False,shopping,ui_state,283,test,webarena_verified.210.282 +webarena_verified.207.284,False,shopping,ui_state,284,test,webarena_verified.207.283 +webarena_verified.207.285,False,shopping,ui_state,285,train,webarena_verified.207.284 +webarena_verified.207.286,False,shopping,ui_state,286,test,webarena_verified.207.285 +webarena_verified.47.287,False,map,string_match,287,test,webarena_verified.47.268 +webarena_verified.234.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.234.247 +webarena_verified.234.289,False,shopping_admin,retrieve_value,289,test,webarena_verified.234.288 +webarena_verified.234.290,False,shopping_admin,retrieve_value,290,train,webarena_verified.234.289 +webarena_verified.234.291,False,shopping_admin,retrieve_value,291,train,webarena_verified.234.290 +webarena_verified.234.292,False,shopping_admin,retrieve_value,292,test,webarena_verified.234.291 +webarena_verified.329.293,False,gitlab,retrieve_value,293,train,webarena_verified.329.259 +webarena_verified.329.294,False,gitlab,retrieve_value,294,train,webarena_verified.329.293 +webarena_verified.329.295,False,gitlab,retrieve_value,295,test,webarena_verified.329.294 +webarena_verified.329.296,False,gitlab,retrieve_value,296,train,webarena_verified.329.295 +webarena_verified.329.297,False,gitlab,retrieve_value,297,test,webarena_verified.329.296 +webarena_verified.180.298,False,shopping,ui_state,298,train,webarena_verified.180.286 +webarena_verified.180.299,False,shopping,ui_state,299,train,webarena_verified.180.298 +webarena_verified.180.300,False,shopping,ui_state,300,test,webarena_verified.180.299 +webarena_verified.180.301,False,shopping,retrieve_value,301,test,webarena_verified.180.300 
+webarena_verified.180.302,False,shopping,retrieve_value,302,train,webarena_verified.180.301 +webarena_verified.321.303,False,gitlab,retrieve_value,303,test,webarena_verified.321.297 +webarena_verified.321.304,False,gitlab,retrieve_value,304,train,webarena_verified.321.303 +webarena_verified.321.305,False,gitlab,retrieve_value,305,train,webarena_verified.321.304 +webarena_verified.321.306,False,gitlab,retrieve_value,306,test,webarena_verified.321.305 +webarena_verified.321.307,False,gitlab,retrieve_value,307,train,webarena_verified.321.306 +webarena_verified.323.308,False,gitlab,retrieve_value,308,train,webarena_verified.323.307 +webarena_verified.323.309,False,gitlab,retrieve_value,309,train,webarena_verified.323.308 +webarena_verified.323.310,False,gitlab,retrieve_value,310,train,webarena_verified.323.309 +webarena_verified.323.311,False,gitlab,retrieve_value,311,test,webarena_verified.323.310 +webarena_verified.323.312,False,gitlab,retrieve_value,312,test,webarena_verified.323.311 +webarena_verified.134.313,False,shopping,retrieve_value,313,train,webarena_verified.134.302 +webarena_verified.324.314,False,gitlab,retrieve_value,314,train,webarena_verified.324.312 +webarena_verified.324.315,False,gitlab,retrieve_value,315,train,webarena_verified.324.314 +webarena_verified.324.316,False,gitlab,retrieve_value,316,test,webarena_verified.324.315 +webarena_verified.324.317,False,gitlab,retrieve_value,317,test,webarena_verified.324.316 +webarena_verified.324.318,False,gitlab,retrieve_value,318,train,webarena_verified.324.317 +webarena_verified.160.319,False,shopping,retrieve_value,319,train,webarena_verified.160.313 +webarena_verified.160.320,False,shopping,retrieve_value,320,test,webarena_verified.160.319 +webarena_verified.160.321,False,shopping,retrieve_value,321,train,webarena_verified.160.320 +webarena_verified.160.322,False,shopping,retrieve_value,322,test,webarena_verified.160.321 
+webarena_verified.160.323,False,shopping,retrieve_value,323,train,webarena_verified.160.322 +webarena_verified.208.324,False,shopping,ui_state,324,train,webarena_verified.208.323 +webarena_verified.208.325,False,shopping,ui_state,325,test,webarena_verified.208.324 +webarena_verified.208.326,False,shopping,ui_state,326,train,webarena_verified.208.325 +webarena_verified.208.327,False,shopping,ui_state,327,test,webarena_verified.208.326 +webarena_verified.208.328,False,shopping,ui_state,328,train,webarena_verified.208.327 +webarena_verified.147.329,False,shopping,retrieve_value,329,test,webarena_verified.147.328 +webarena_verified.147.330,False,shopping,retrieve_value,330,test,webarena_verified.147.329 +webarena_verified.147.331,False,shopping,retrieve_value,331,test,webarena_verified.147.330 +webarena_verified.147.332,False,shopping,retrieve_value,332,train,webarena_verified.147.331 +webarena_verified.147.333,False,shopping,retrieve_value,333,train,webarena_verified.147.332 +webarena_verified.169.334,False,shopping,retrieve_value,334,train,webarena_verified.169.333 +webarena_verified.169.335,False,shopping,retrieve_value,335,train,webarena_verified.169.334 +webarena_verified.169.336,False,shopping,retrieve_value,336,test,webarena_verified.169.335 +webarena_verified.169.337,False,shopping,retrieve_value,337,test,webarena_verified.169.336 +webarena_verified.169.338,False,shopping,retrieve_value,338,train,webarena_verified.169.337 +webarena_verified.299.339,False,gitlab,ui_state,339,test,webarena_verified.299.318 +webarena_verified.299.340,False,gitlab,ui_state,340,train,webarena_verified.299.339 +webarena_verified.299.341,False,gitlab,ui_state,341,test,webarena_verified.299.340 +webarena_verified.299.342,False,gitlab,ui_state,342,test,webarena_verified.299.341 +webarena_verified.299.343,False,gitlab,ui_state,343,test,webarena_verified.299.342 +webarena_verified.248.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.248.292 
+webarena_verified.248.345,False,shopping_admin,retrieve_value,345,train,webarena_verified.248.344 +webarena_verified.248.346,False,shopping_admin,retrieve_value,346,train,webarena_verified.248.345 +webarena_verified.248.347,False,shopping_admin,retrieve_value,347,train,webarena_verified.248.346 +webarena_verified.248.348,False,shopping_admin,retrieve_value,348,test,webarena_verified.248.347 +webarena_verified.298.349,False,gitlab,retrieve_value,349,test,webarena_verified.298.343 +webarena_verified.298.350,False,gitlab,retrieve_value,350,test,webarena_verified.298.349 +webarena_verified.137.351,False,shopping,ui_state,351,train,webarena_verified.137.338 +webarena_verified.137.352,False,shopping,ui_state,352,test,webarena_verified.137.351 +webarena_verified.137.353,False,shopping,ui_state,353,test,webarena_verified.137.352 +webarena_verified.137.354,False,shopping,ui_state,354,train,webarena_verified.137.353 +webarena_verified.137.355,False,shopping,ui_state,355,train,webarena_verified.137.354 +webarena_verified.49.356,False,map,program_html,356,test,webarena_verified.49.287 +webarena_verified.291.357,False,gitlab,ui_state,357,test,webarena_verified.291.350 +webarena_verified.206.358,False,shopping,retrieve_value,358,train,webarena_verified.206.355 +webarena_verified.206.359,False,shopping,retrieve_value,359,test,webarena_verified.206.358 +webarena_verified.206.360,False,shopping,retrieve_value,360,train,webarena_verified.206.359 +webarena_verified.206.361,False,shopping,retrieve_value,361,train,webarena_verified.206.360 +webarena_verified.206.362,False,shopping,retrieve_value,362,test,webarena_verified.206.361 +webarena_verified.58.363,False,map,retrieve_value,363,train,webarena_verified.58.356 +webarena_verified.58.364,False,map,retrieve_value,364,test,webarena_verified.58.363 +webarena_verified.58.365,False,map,retrieve_value,365,test,webarena_verified.58.364 +webarena_verified.58.366,False,map,retrieve_value,366,train,webarena_verified.58.365 
+webarena_verified.58.367,False,map,retrieve_value,367,train,webarena_verified.58.366 +webarena_verified.188.368,False,shopping,retrieve_value,368,test,webarena_verified.188.362 +webarena_verified.52.369,False,map,program_html,369,train,webarena_verified.52.367 +webarena_verified.52.370,False,map,program_html,370,test,webarena_verified.52.369 +webarena_verified.52.371,False,map,program_html,371,test,webarena_verified.52.370 +webarena_verified.52.372,False,map,program_html,372,train,webarena_verified.52.371 +webarena_verified.52.373,False,map,program_html,373,train,webarena_verified.52.372 +webarena_verified.266.374,False,shopping_admin,ui_state,374,train,webarena_verified.266.348 +webarena_verified.266.375,False,shopping_admin,ui_state,375,train,webarena_verified.266.374 +webarena_verified.182.376,False,shopping,retrieve_value,376,test,webarena_verified.182.368 +webarena_verified.59.377,False,map,ui_state,377,test,webarena_verified.59.373 +webarena_verified.59.378,False,map,ui_state,378,train,webarena_verified.59.377 +webarena_verified.59.379,False,map,ui_state,379,train,webarena_verified.59.378 +webarena_verified.59.380,False,map,ui_state,380,test,webarena_verified.59.379 +webarena_verified.59.381,False,map,ui_state,381,train,webarena_verified.59.380 +webarena_verified.781.382,False,map,string_match,382,test,webarena_verified.781.381 +webarena_verified.782.383,False,map,retrieve_value,383,test,webarena_verified.782.382 +webarena_verified.666.384,False,shopping,retrieve_value,384,test,webarena_verified.666.376 +webarena_verified.666.385,False,shopping,retrieve_value,385,train,webarena_verified.666.384 +webarena_verified.1355.386,False,shopping,retrieve_value,386,test,webarena_verified.1355.385 +webarena_verified.1356.387,False,shopping,retrieve_value,387,train,webarena_verified.1356.386 +webarena_verified.1356.388,False,shopping,retrieve_value,388,test,webarena_verified.1356.387 
+webarena_verified.348.389,False,gitlab,backend_state,389,test,webarena_verified.348.357 +webarena_verified.348.390,False,gitlab,backend_state,390,train,webarena_verified.348.389 +webarena_verified.348.391,False,gitlab,backend_state,391,train,webarena_verified.348.390 +webarena_verified.348.392,False,gitlab,backend_state,392,test,webarena_verified.348.391 +webarena_verified.348.393,False,gitlab,backend_state,393,train,webarena_verified.348.392 +webarena_verified.352.394,False,gitlab,backend_state,394,test,webarena_verified.352.393 +webarena_verified.352.395,False,gitlab,backend_state,395,train,webarena_verified.352.394 +webarena_verified.352.396,False,gitlab,backend_state,396,train,webarena_verified.352.395 +webarena_verified.352.397,False,gitlab,backend_state,397,train,webarena_verified.352.396 +webarena_verified.352.398,False,gitlab,backend_state,398,test,webarena_verified.352.397 +webarena_verified.6.399,False,reddit,backend_state,399,train,webarena_verified.6.69 +webarena_verified.6.400,False,reddit,backend_state,400,test,webarena_verified.6.399 +webarena_verified.6.401,False,reddit,backend_state,401,train,webarena_verified.6.400 +webarena_verified.6.402,False,reddit,backend_state,402,train,webarena_verified.6.401 +webarena_verified.6.403,False,reddit,backend_state,403,test,webarena_verified.6.402 +webarena_verified.22.404,False,reddit,backend_state,404,train,webarena_verified.22.403 +webarena_verified.22.405,False,reddit,backend_state,405,test,webarena_verified.22.404 +webarena_verified.22.406,False,reddit,backend_state,406,train,webarena_verified.22.405 +webarena_verified.22.407,False,reddit,backend_state,407,test,webarena_verified.22.406 +webarena_verified.22.408,False,reddit,backend_state,408,train,webarena_verified.22.407 +webarena_verified.23.409,False,reddit,backend_state,409,test,webarena_verified.23.408 +webarena_verified.23.410,False,reddit,backend_state,410,test,webarena_verified.23.409 
+webarena_verified.355.411,False,gitlab,backend_state,411,test,webarena_verified.355.398 +webarena_verified.355.412,False,gitlab,backend_state,412,test,webarena_verified.355.411 +webarena_verified.355.413,False,gitlab,backend_state,413,test,webarena_verified.355.412 +webarena_verified.355.414,False,gitlab,backend_state,414,test,webarena_verified.355.413 +webarena_verified.360.415,False,gitlab,backend_state,415,test,webarena_verified.360.414 +webarena_verified.360.416,False,gitlab,backend_state,416,test,webarena_verified.360.415 +webarena_verified.360.417,False,gitlab,backend_state,417,test,webarena_verified.360.416 +webarena_verified.361.418,False,gitlab,backend_state,418,train,webarena_verified.361.417 +webarena_verified.361.419,False,gitlab,backend_state,419,test,webarena_verified.361.418 +webarena_verified.361.420,False,gitlab,backend_state,420,test,webarena_verified.361.419 +webarena_verified.361.421,False,gitlab,backend_state,421,train,webarena_verified.361.420 +webarena_verified.361.422,False,gitlab,backend_state,422,train,webarena_verified.361.421 +webarena_verified.237.423,False,shopping_admin,backend_state,423,train,webarena_verified.237.375 +webarena_verified.371.424,False,wikipedia map,program_html,424,train,webarena_verified.371.383 +webarena_verified.371.425,False,wikipedia map,program_html,425,train,webarena_verified.371.424 +webarena_verified.371.426,False,wikipedia map,program_html,426,test,webarena_verified.371.425 +webarena_verified.371.427,False,wikipedia map,program_html,427,test,webarena_verified.371.426 +webarena_verified.371.428,False,wikipedia map,program_html,428,train,webarena_verified.371.427 +webarena_verified.371.429,False,wikipedia map,program_html,429,train,webarena_verified.371.428 +webarena_verified.371.430,False,wikipedia map,program_html,430,test,webarena_verified.371.429 +webarena_verified.145.431,False,shopping,program_html,431,train,webarena_verified.145.388 
+webarena_verified.145.432,False,shopping,backend_state,432,test,webarena_verified.145.431 +webarena_verified.145.433,False,shopping,backend_state,433,train,webarena_verified.145.432 +webarena_verified.145.434,False,shopping,backend_state,434,train,webarena_verified.145.433 +webarena_verified.145.435,False,shopping,backend_state,435,train,webarena_verified.145.434 +webarena_verified.156.436,False,shopping,backend_state,436,test,webarena_verified.156.435 +webarena_verified.156.437,False,shopping,backend_state,437,train,webarena_verified.156.436 +webarena_verified.156.438,False,shopping,backend_state,438,train,webarena_verified.156.437 +webarena_verified.156.439,False,shopping,backend_state,439,train,webarena_verified.156.438 +webarena_verified.156.440,False,shopping,backend_state,440,test,webarena_verified.156.439 +webarena_verified.308.441,False,gitlab,backend_state,441,train,webarena_verified.308.422 +webarena_verified.308.442,False,gitlab,backend_state,442,train,webarena_verified.308.441 +webarena_verified.308.443,False,gitlab,backend_state,443,test,webarena_verified.308.442 +webarena_verified.308.444,False,gitlab,backend_state,444,train,webarena_verified.308.443 +webarena_verified.308.445,False,gitlab,backend_state,445,test,webarena_verified.308.444 +webarena_verified.999.446,False,gitlab,backend_state,446,test,webarena_verified.999.445 +webarena_verified.999.447,False,gitlab,backend_state,447,train,webarena_verified.999.446 +webarena_verified.331.448,False,gitlab,backend_state,448,test,webarena_verified.331.447 +webarena_verified.331.449,False,gitlab,backend_state,449,test,webarena_verified.331.448 +webarena_verified.331.450,False,gitlab,retrieve_value,450,train,webarena_verified.331.449 +webarena_verified.331.451,False,gitlab,retrieve_value,451,train,webarena_verified.331.450 +webarena_verified.331.452,False,gitlab,retrieve_value,452,train,webarena_verified.331.451 
+webarena_verified.242.453,False,shopping_admin,backend_state,453,train,webarena_verified.242.423 +webarena_verified.242.454,False,shopping_admin,backend_state,454,test,webarena_verified.242.453 +webarena_verified.242.455,False,shopping_admin,backend_state,455,train,webarena_verified.242.454 +webarena_verified.242.456,False,shopping_admin,backend_state,456,test,webarena_verified.242.455 +webarena_verified.242.457,False,shopping_admin,backend_state,457,train,webarena_verified.242.456 +webarena_verified.247.458,False,shopping_admin,backend_state,458,test,webarena_verified.247.457 +webarena_verified.247.459,False,shopping_admin,backend_state,459,test,webarena_verified.247.458 +webarena_verified.247.460,False,shopping_admin,backend_state,460,train,webarena_verified.247.459 +webarena_verified.247.461,False,shopping_admin,backend_state,461,train,webarena_verified.247.460 +webarena_verified.247.462,False,shopping_admin,backend_state,462,test,webarena_verified.247.461 +webarena_verified.247.463,False,shopping_admin,backend_state,463,test,webarena_verified.247.462 +webarena_verified.251.464,False,shopping_admin,backend_state,464,train,webarena_verified.251.463 +webarena_verified.186.465,False,shopping,backend_state,465,train,webarena_verified.186.440 +webarena_verified.186.466,False,shopping,backend_state,466,train,webarena_verified.186.465 +webarena_verified.186.467,False,shopping,backend_state,467,train,webarena_verified.186.466 +webarena_verified.186.468,False,shopping,backend_state,468,test,webarena_verified.186.467 +webarena_verified.186.469,False,shopping,backend_state,469,test,webarena_verified.186.468 +webarena_verified.257.470,False,shopping_admin,backend_state,470,test,webarena_verified.257.464 +webarena_verified.257.471,False,shopping_admin,backend_state,471,test,webarena_verified.257.470 +webarena_verified.257.472,False,shopping_admin,backend_state,472,train,webarena_verified.257.471 
+webarena_verified.257.473,False,shopping_admin,backend_state,473,train,webarena_verified.257.472 +webarena_verified.257.474,False,shopping_admin,backend_state,474,train,webarena_verified.257.473 +webarena_verified.292.475,False,gitlab,backend_state,475,train,webarena_verified.292.452 +webarena_verified.292.476,False,gitlab,backend_state,476,train,webarena_verified.292.475 +webarena_verified.292.477,False,gitlab,backend_state,477,train,webarena_verified.292.476 +webarena_verified.292.478,False,gitlab,backend_state,478,test,webarena_verified.292.477 +webarena_verified.292.479,False,gitlab,backend_state,479,test,webarena_verified.292.478 +webarena_verified.293.480,False,gitlab,backend_state,480,train,webarena_verified.293.479 +webarena_verified.294.481,False,gitlab,backend_state,481,train,webarena_verified.294.480 +webarena_verified.294.482,False,gitlab,backend_state,482,train,webarena_verified.294.481 +webarena_verified.294.483,False,gitlab,backend_state,483,test,webarena_verified.294.482 +webarena_verified.294.484,False,gitlab,backend_state,484,train,webarena_verified.294.483 +webarena_verified.294.485,False,gitlab,backend_state,485,test,webarena_verified.294.484 +webarena_verified.275.486,False,shopping_admin,backend_state,486,train,webarena_verified.275.474 +webarena_verified.275.487,False,shopping_admin,backend_state,487,test,webarena_verified.275.486 +webarena_verified.275.488,False,shopping_admin,backend_state,488,test,webarena_verified.275.487 +webarena_verified.275.489,False,shopping_admin,backend_state,489,train,webarena_verified.275.488 +webarena_verified.275.490,False,shopping_admin,backend_state,490,train,webarena_verified.275.489 +webarena_verified.280.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.280.490 +webarena_verified.280.492,False,shopping_admin,backend_state,492,train,webarena_verified.280.491 +webarena_verified.280.493,False,shopping_admin,backend_state,493,train,webarena_verified.280.492 
+webarena_verified.280.494,False,shopping_admin,backend_state,494,train,webarena_verified.280.493 +webarena_verified.280.495,False,shopping_admin,backend_state,495,test,webarena_verified.280.494 +webarena_verified.284.496,False,shopping_admin,backend_state,496,train,webarena_verified.284.495 +webarena_verified.284.497,False,shopping_admin,backend_state,497,test,webarena_verified.284.496 +webarena_verified.284.498,False,shopping_admin,backend_state,498,test,webarena_verified.284.497 +webarena_verified.284.499,False,shopping_admin,backend_state,499,train,webarena_verified.284.498 +webarena_verified.284.500,False,shopping_admin,backend_state,500,train,webarena_verified.284.499 +webarena_verified.287.501,False,shopping_admin,backend_state,501,train,webarena_verified.287.500 +webarena_verified.287.502,False,shopping_admin,backend_state,502,test,webarena_verified.287.501 +webarena_verified.287.503,False,shopping_admin,backend_state,503,train,webarena_verified.287.502 +webarena_verified.287.504,False,shopping_admin,backend_state,504,test,webarena_verified.287.503 +webarena_verified.287.505,False,shopping_admin,backend_state,505,train,webarena_verified.287.504 +webarena_verified.172.506,False,shopping,backend_state,506,train,webarena_verified.172.469 +webarena_verified.172.507,False,shopping,backend_state,507,train,webarena_verified.172.506 +webarena_verified.172.508,False,shopping,backend_state,508,test,webarena_verified.172.507 +webarena_verified.216.509,False,shopping,backend_state,509,test,webarena_verified.216.508 +webarena_verified.216.510,False,shopping,backend_state,510,test,webarena_verified.216.509 +webarena_verified.189.511,False,shopping,program_html,511,test,webarena_verified.189.510 +webarena_verified.189.512,False,shopping,program_html,512,train,webarena_verified.189.511 +webarena_verified.189.513,False,shopping,program_html,513,train,webarena_verified.189.512 +webarena_verified.189.514,False,shopping,program_html,514,test,webarena_verified.189.513 
+webarena_verified.189.515,False,shopping,program_html,515,train,webarena_verified.189.514 +webarena_verified.196.516,False,shopping,backend_state,516,train,webarena_verified.196.515 +webarena_verified.196.517,False,shopping,backend_state,517,test,webarena_verified.196.516 +webarena_verified.196.518,False,shopping,backend_state,518,test,webarena_verified.196.517 +webarena_verified.196.519,False,shopping,backend_state,519,test,webarena_verified.196.518 +webarena_verified.196.520,False,shopping,backend_state,520,train,webarena_verified.196.519 +webarena_verified.199.521,False,shopping,backend_state,521,test,webarena_verified.199.520 +webarena_verified.352.522,False,gitlab,backend_state,522,test,webarena_verified.352.485 +webarena_verified.354.523,False,gitlab,backend_state,523,train,webarena_verified.354.522 +webarena_verified.354.524,False,gitlab,backend_state,524,test,webarena_verified.354.523 +webarena_verified.354.525,False,gitlab,backend_state,525,train,webarena_verified.354.524 +webarena_verified.354.526,False,gitlab,backend_state,526,train,webarena_verified.354.525 +webarena_verified.354.527,False,gitlab,backend_state,527,test,webarena_verified.354.526 +webarena_verified.154.528,False,shopping,program_html,528,train,webarena_verified.154.521 +webarena_verified.154.529,False,shopping,program_html,529,test,webarena_verified.154.528 +webarena_verified.154.530,False,shopping,program_html,530,test,webarena_verified.154.529 +webarena_verified.154.531,False,shopping,program_html,531,train,webarena_verified.154.530 +webarena_verified.154.532,False,shopping,program_html,532,train,webarena_verified.154.531 +webarena_verified.330.533,False,gitlab,backend_state,533,test,webarena_verified.330.527 +webarena_verified.330.534,False,gitlab,backend_state,534,train,webarena_verified.330.533 +webarena_verified.330.535,False,gitlab,backend_state,535,test,webarena_verified.330.534 +webarena_verified.330.536,False,gitlab,backend_state,536,train,webarena_verified.330.535 
+webarena_verified.330.537,False,gitlab,backend_state,537,train,webarena_verified.330.536 +webarena_verified.240.538,False,shopping_admin,backend_state,538,train,webarena_verified.240.505 +webarena_verified.240.539,False,shopping_admin,backend_state,539,train,webarena_verified.240.538 +webarena_verified.240.540,False,shopping_admin,backend_state,540,test,webarena_verified.240.539 +webarena_verified.240.541,False,shopping_admin,backend_state,541,test,webarena_verified.240.540 +webarena_verified.240.542,False,shopping_admin,backend_state,542,train,webarena_verified.240.541 +webarena_verified.251.543,False,shopping_admin,backend_state,543,test,webarena_verified.251.542 +webarena_verified.251.544,False,shopping_admin,backend_state,544,test,webarena_verified.251.543 +webarena_verified.251.545,False,shopping_admin,backend_state,545,test,webarena_verified.251.544 +webarena_verified.251.546,False,shopping_admin,retrieve_value,546,train,webarena_verified.251.545 +webarena_verified.252.547,False,shopping_admin,backend_state,547,train,webarena_verified.252.546 +webarena_verified.252.548,False,shopping_admin,backend_state,548,train,webarena_verified.252.547 +webarena_verified.252.549,False,shopping_admin,backend_state,549,test,webarena_verified.252.548 +webarena_verified.252.550,False,shopping_admin,backend_state,550,train,webarena_verified.252.549 +webarena_verified.252.551,False,shopping_admin,backend_state,551,test,webarena_verified.252.550 +webarena_verified.84.552,False,gitlab reddit,program_html,552,test,webarena_verified.84.537 webarena_verified.84.410 +webarena_verified.84.553,False,gitlab reddit,program_html,553,test,webarena_verified.84.552 +webarena_verified.84.554,False,gitlab reddit,program_html,554,test,webarena_verified.84.553 +webarena_verified.84.555,False,gitlab reddit,program_html,555,test,webarena_verified.84.554 +webarena_verified.87.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.87.555 +webarena_verified.87.557,False,gitlab 
wikipedia,program_html,557,test,webarena_verified.87.556 +webarena_verified.87.558,False,gitlab wikipedia,program_html,558,train,webarena_verified.87.557 +webarena_verified.87.559,False,gitlab wikipedia,program_html,559,train,webarena_verified.87.558 +webarena_verified.87.560,False,gitlab wikipedia,program_html,560,test,webarena_verified.87.559 +webarena_verified.87.561,False,gitlab wikipedia,program_html,561,test,webarena_verified.87.560 +webarena_verified.88.562,False,gitlab reddit,program_html,562,train,webarena_verified.88.561 webarena_verified.88.555 +webarena_verified.88.563,False,gitlab reddit,program_html,563,train,webarena_verified.88.562 +webarena_verified.88.564,False,gitlab reddit,program_html,564,train,webarena_verified.88.563 +webarena_verified.88.565,False,gitlab reddit,program_html,565,test,webarena_verified.88.564 +webarena_verified.88.566,False,gitlab reddit,program_html,566,test,webarena_verified.88.565 +webarena_verified.293.567,False,gitlab,backend_state,567,test,webarena_verified.293.566 +webarena_verified.293.568,False,gitlab,backend_state,568,train,webarena_verified.293.567 +webarena_verified.293.569,False,gitlab,backend_state,569,train,webarena_verified.293.568 +webarena_verified.293.570,False,gitlab,backend_state,570,test,webarena_verified.293.569 +webarena_verified.165.571,False,shopping,backend_state,571,test,webarena_verified.165.532 +webarena_verified.165.572,False,shopping,backend_state,572,train,webarena_verified.165.571 +webarena_verified.165.573,False,shopping,backend_state,573,train,webarena_verified.165.572 +webarena_verified.165.574,False,shopping,backend_state,574,test,webarena_verified.165.573 +webarena_verified.165.575,False,shopping,backend_state,575,train,webarena_verified.165.574 +webarena_verified.351.576,False,gitlab,backend_state,576,test,webarena_verified.351.570 +webarena_verified.351.577,False,gitlab,backend_state,577,train,webarena_verified.351.576 
+webarena_verified.351.578,False,gitlab,backend_state,578,test,webarena_verified.351.577 +webarena_verified.351.579,False,gitlab,backend_state,579,train,webarena_verified.351.578 +webarena_verified.7.580,False,reddit,backend_state,580,train,webarena_verified.7.566 +webarena_verified.7.581,False,reddit,backend_state,581,train,webarena_verified.7.580 +webarena_verified.7.582,False,reddit,backend_state,582,test,webarena_verified.7.581 +webarena_verified.7.583,False,reddit,backend_state,583,test,webarena_verified.7.582 +webarena_verified.7.584,False,reddit,backend_state,584,train,webarena_verified.7.583 +webarena_verified.194.585,False,shopping,backend_state,585,train,webarena_verified.194.575 +webarena_verified.194.586,False,shopping,backend_state,586,test,webarena_verified.194.585 +webarena_verified.194.587,False,shopping,backend_state,587,train,webarena_verified.194.586 +webarena_verified.194.588,False,shopping,backend_state,588,train,webarena_verified.194.587 +webarena_verified.194.589,False,shopping,backend_state,589,test,webarena_verified.194.588 +webarena_verified.339.590,False,gitlab,backend_state,590,train,webarena_verified.339.579 +webarena_verified.339.591,False,gitlab,backend_state,591,test,webarena_verified.339.590 +webarena_verified.339.592,False,gitlab,backend_state,592,test,webarena_verified.339.591 +webarena_verified.339.593,False,gitlab,backend_state,593,test,webarena_verified.339.592 +webarena_verified.339.594,False,gitlab,backend_state,594,train,webarena_verified.339.593 +webarena_verified.4.595,False,reddit,backend_state,595,train,webarena_verified.4.584 +webarena_verified.4.596,False,reddit,backend_state,596,test,webarena_verified.4.595 +webarena_verified.4.597,False,reddit,backend_state,597,train,webarena_verified.4.596 +webarena_verified.4.598,False,reddit,backend_state,598,train,webarena_verified.4.597 +webarena_verified.4.599,False,reddit,backend_state,599,test,webarena_verified.4.598 
+webarena_verified.3765.600,False,reddit,backend_state,600,test,webarena_verified.3765.599 +webarena_verified.3765.601,False,reddit,backend_state,601,train,webarena_verified.3765.600 +webarena_verified.3765.602,False,reddit,backend_state,602,train,webarena_verified.3765.601 +webarena_verified.3765.603,False,reddit,backend_state,603,train,webarena_verified.3765.602 +webarena_verified.3765.604,False,reddit,backend_state,604,test,webarena_verified.3765.603 +webarena_verified.5.605,False,reddit,backend_state,605,train,webarena_verified.5.604 +webarena_verified.5.606,False,reddit,backend_state,606,train,webarena_verified.5.605 +webarena_verified.5.607,False,reddit,backend_state,607,test,webarena_verified.5.606 +webarena_verified.5.608,False,reddit,backend_state,608,test,webarena_verified.5.607 +webarena_verified.5.609,False,reddit,backend_state,609,train,webarena_verified.5.608 +webarena_verified.9.610,False,reddit,backend_state,610,train,webarena_verified.9.609 +webarena_verified.9.611,False,reddit,backend_state,611,train,webarena_verified.9.610 +webarena_verified.9.612,False,reddit,backend_state,612,test,webarena_verified.9.611 +webarena_verified.9.613,False,reddit,backend_state,613,train,webarena_verified.9.612 +webarena_verified.9.614,False,reddit,backend_state,614,test,webarena_verified.9.613 +webarena_verified.11.615,False,reddit,ui_state,615,test,webarena_verified.11.614 +webarena_verified.11.616,False,reddit,ui_state,616,test,webarena_verified.11.615 +webarena_verified.11.617,False,reddit,ui_state,617,train,webarena_verified.11.616 +webarena_verified.11.618,False,reddit,ui_state,618,train,webarena_verified.11.617 +webarena_verified.11.619,False,reddit,ui_state,619,train,webarena_verified.11.618 +webarena_verified.12.620,False,reddit,backend_state,620,train,webarena_verified.12.619 +webarena_verified.12.621,False,reddit,backend_state,621,train,webarena_verified.12.620 +webarena_verified.12.622,False,reddit,backend_state,622,train,webarena_verified.12.621 
+webarena_verified.12.623,False,reddit,backend_state,623,test,webarena_verified.12.622 +webarena_verified.12.624,False,reddit,backend_state,624,test,webarena_verified.12.623 +webarena_verified.13.625,False,reddit,backend_state,625,train,webarena_verified.13.624 +webarena_verified.13.626,False,reddit,backend_state,626,train,webarena_verified.13.625 +webarena_verified.13.627,False,reddit,backend_state,627,train,webarena_verified.13.626 +webarena_verified.13.628,False,reddit,backend_state,628,test,webarena_verified.13.627 +webarena_verified.13.629,False,reddit,backend_state,629,test,webarena_verified.13.628 +webarena_verified.15.630,False,reddit,backend_state,630,test,webarena_verified.15.629 +webarena_verified.15.631,False,reddit,backend_state,631,train,webarena_verified.15.630 +webarena_verified.15.632,False,reddit,backend_state,632,train,webarena_verified.15.631 +webarena_verified.15.633,False,reddit,backend_state,633,test,webarena_verified.15.632 +webarena_verified.15.634,False,reddit,backend_state,634,train,webarena_verified.15.633 +webarena_verified.6100.635,False,reddit,backend_state,635,train,webarena_verified.6100.634 +webarena_verified.6100.636,False,reddit,backend_state,636,train,webarena_verified.6100.635 +webarena_verified.6100.637,False,reddit,backend_state,637,train,webarena_verified.6100.636 +webarena_verified.6100.638,False,reddit,ui_state,638,test,webarena_verified.6100.637 +webarena_verified.6100.639,False,reddit,backend_state,639,test,webarena_verified.6100.638 +webarena_verified.16.640,False,reddit,backend_state,640,train,webarena_verified.16.639 +webarena_verified.16.641,False,reddit,backend_state,641,test,webarena_verified.16.640 +webarena_verified.16.642,False,reddit,backend_state,642,test,webarena_verified.16.641 +webarena_verified.16.643,False,reddit,backend_state,643,train,webarena_verified.16.642 +webarena_verified.16.644,False,reddit,backend_state,644,train,webarena_verified.16.643 
+webarena_verified.19.645,False,reddit,backend_state,645,train,webarena_verified.19.644 +webarena_verified.19.646,False,reddit,backend_state,646,train,webarena_verified.19.645 +webarena_verified.19.647,False,reddit,backend_state,647,train,webarena_verified.19.646 +webarena_verified.19.648,False,reddit,backend_state,648,test,webarena_verified.19.647 +webarena_verified.19.649,False,reddit,backend_state,649,test,webarena_verified.19.648 +webarena_verified.23.650,False,reddit,backend_state,650,train,webarena_verified.23.649 +webarena_verified.23.651,False,reddit,backend_state,651,train,webarena_verified.23.650 +webarena_verified.23.652,False,reddit,backend_state,652,train,webarena_verified.23.651 +webarena_verified.153.653,False,shopping,ui_state,653,train,webarena_verified.153.589 +webarena_verified.153.654,False,shopping,ui_state,654,test,webarena_verified.153.653 +webarena_verified.153.655,False,shopping,ui_state,655,test,webarena_verified.153.654 +webarena_verified.153.656,False,shopping,ui_state,656,train,webarena_verified.153.655 +webarena_verified.153.657,False,shopping,ui_state,657,train,webarena_verified.153.656 +webarena_verified.327.658,False,gitlab,backend_state,658,train,webarena_verified.327.594 +webarena_verified.327.659,False,gitlab,backend_state,659,test,webarena_verified.327.658 +webarena_verified.327.660,False,gitlab,backend_state,660,test,webarena_verified.327.659 +webarena_verified.328.661,False,gitlab,backend_state,661,test,webarena_verified.328.660 +webarena_verified.328.662,False,gitlab,backend_state,662,train,webarena_verified.328.661 +webarena_verified.328.663,False,gitlab,backend_state,663,train,webarena_verified.328.662 +webarena_verified.328.664,False,gitlab,backend_state,664,test,webarena_verified.328.663 +webarena_verified.328.665,False,gitlab,backend_state,665,train,webarena_verified.328.664 +webarena_verified.335.666,False,gitlab,retrieve_value,666,test,webarena_verified.335.665 
+webarena_verified.335.667,False,gitlab,backend_state,667,test,webarena_verified.335.666 +webarena_verified.335.668,False,gitlab,retrieve_value,668,test,webarena_verified.335.667 +webarena_verified.337.669,False,gitlab,backend_state,669,test,webarena_verified.337.668 +webarena_verified.337.670,False,gitlab,backend_state,670,train,webarena_verified.337.669 +webarena_verified.101.671,False,shopping reddit,ui_state,671,train,webarena_verified.101.657 webarena_verified.101.652 +webarena_verified.101.672,False,shopping reddit,ui_state,672,train,webarena_verified.101.671 +webarena_verified.101.673,False,shopping reddit,ui_state,673,test,webarena_verified.101.672 +webarena_verified.101.674,False,shopping reddit,ui_state,674,test,webarena_verified.101.673 +webarena_verified.101.675,False,shopping reddit,ui_state,675,train,webarena_verified.101.674 +webarena_verified.253.676,False,shopping_admin,ui_state,676,test,webarena_verified.253.551 +webarena_verified.253.677,False,shopping_admin,ui_state,677,test,webarena_verified.253.676 +webarena_verified.253.678,False,shopping_admin,ui_state,678,train,webarena_verified.253.677 +webarena_verified.253.679,False,shopping_admin,ui_state,679,train,webarena_verified.253.678 +webarena_verified.253.680,False,shopping_admin,ui_state,680,train,webarena_verified.253.679 +webarena_verified.116.681,False,reddit gitlab,ui_state,681,train,webarena_verified.116.675 webarena_verified.116.670 +webarena_verified.116.682,False,reddit gitlab,ui_state,682,train,webarena_verified.116.681 +webarena_verified.116.683,False,reddit gitlab,ui_state,683,test,webarena_verified.116.682 +webarena_verified.117.684,False,reddit gitlab,ui_state,684,train,webarena_verified.117.683 +webarena_verified.117.685,False,reddit gitlab,ui_state,685,train,webarena_verified.117.684 +webarena_verified.117.686,False,reddit gitlab,ui_state,686,train,webarena_verified.117.685 +webarena_verified.117.687,False,reddit gitlab,ui_state,687,test,webarena_verified.117.686 
+webarena_verified.117.688,False,reddit gitlab,ui_state,688,test,webarena_verified.117.687 +webarena_verified.163.689,False,shopping,ui_state,689,test,webarena_verified.163.675 +webarena_verified.163.690,False,shopping,ui_state,690,test,webarena_verified.163.689 +webarena_verified.163.691,False,shopping,ui_state,691,train,webarena_verified.163.690 +webarena_verified.163.692,False,shopping,ui_state,692,train,webarena_verified.163.691 +webarena_verified.163.693,False,shopping,ui_state,693,train,webarena_verified.163.692 +webarena_verified.256.694,False,shopping_admin,backend_state,694,train,webarena_verified.256.680 +webarena_verified.256.695,False,shopping_admin,backend_state,695,train,webarena_verified.256.694 +webarena_verified.256.696,False,shopping_admin,backend_state,696,test,webarena_verified.256.695 +webarena_verified.256.697,False,shopping_admin,backend_state,697,train,webarena_verified.256.696 +webarena_verified.256.698,False,shopping_admin,backend_state,698,test,webarena_verified.256.697 +webarena_verified.258.699,False,shopping_admin,backend_state,699,train,webarena_verified.258.698 +webarena_verified.258.700,False,shopping_admin,backend_state,700,test,webarena_verified.258.699 +webarena_verified.258.701,False,shopping_admin,backend_state,701,test,webarena_verified.258.700 +webarena_verified.258.702,False,shopping_admin,backend_state,702,train,webarena_verified.258.701 +webarena_verified.258.703,False,shopping_admin,backend_state,703,train,webarena_verified.258.702 +webarena_verified.268.704,False,shopping_admin,ui_state,704,test,webarena_verified.268.703 +webarena_verified.268.705,False,shopping_admin,ui_state,705,test,webarena_verified.268.704 +webarena_verified.268.706,False,shopping_admin,ui_state,706,train,webarena_verified.268.705 +webarena_verified.268.707,False,shopping_admin,ui_state,707,train,webarena_verified.268.706 +webarena_verified.268.708,False,shopping_admin,ui_state,708,train,webarena_verified.268.707 
+webarena_verified.271.709,False,shopping_admin,ui_state,709,test,webarena_verified.271.708 +webarena_verified.271.710,False,shopping_admin,ui_state,710,test,webarena_verified.271.709 +webarena_verified.271.711,False,shopping_admin,ui_state,711,train,webarena_verified.271.710 +webarena_verified.271.712,False,shopping_admin,ui_state,712,train,webarena_verified.271.711 +webarena_verified.271.713,False,shopping_admin,ui_state,713,train,webarena_verified.271.712 +webarena_verified.24.714,False,reddit,backend_state,714,train,webarena_verified.24.688 +webarena_verified.24.715,False,reddit,backend_state,715,train,webarena_verified.24.714 +webarena_verified.24.716,False,reddit,backend_state,716,train,webarena_verified.24.715 +webarena_verified.24.717,False,reddit,backend_state,717,test,webarena_verified.24.716 +webarena_verified.24.718,False,reddit,backend_state,718,test,webarena_verified.24.717 +webarena_verified.25.719,False,reddit,backend_state,719,train,webarena_verified.25.718 +webarena_verified.25.720,False,reddit,backend_state,720,test,webarena_verified.25.719 +webarena_verified.25.721,False,reddit,backend_state,721,train,webarena_verified.25.720 +webarena_verified.25.722,False,reddit,backend_state,722,train,webarena_verified.25.721 +webarena_verified.25.723,False,reddit,backend_state,723,test,webarena_verified.25.722 +webarena_verified.25.724,False,reddit,backend_state,724,test,webarena_verified.25.723 +webarena_verified.1510.725,False,reddit,backend_state,725,test,webarena_verified.1510.724 +webarena_verified.1510.726,False,reddit,backend_state,726,test,webarena_verified.1510.725 +webarena_verified.1510.727,False,reddit,backend_state,727,train,webarena_verified.1510.726 +webarena_verified.1510.728,False,reddit,backend_state,728,train,webarena_verified.1510.727 +webarena_verified.1510.729,False,reddit,backend_state,729,train,webarena_verified.1510.728 +webarena_verified.1510.730,False,reddit,backend_state,730,test,webarena_verified.1510.729 
+webarena_verified.27.731,False,reddit,backend_state,731,test,webarena_verified.27.730 +webarena_verified.27.732,False,reddit,backend_state,732,train,webarena_verified.27.731 +webarena_verified.27.733,False,reddit,backend_state,733,train,webarena_verified.27.732 +webarena_verified.27.734,False,reddit,program_html,734,train,webarena_verified.27.733 +webarena_verified.27.735,False,reddit,program_html,735,test,webarena_verified.27.734 +webarena_verified.355.736,False,gitlab,backend_state,736,train,webarena_verified.355.688 +webarena_verified.94.737,False,wikipedia map,program_html,737,train,webarena_verified.94.430 +webarena_verified.94.738,False,wikipedia map,program_html,738,test,webarena_verified.94.737 +webarena_verified.94.739,False,wikipedia map,program_html,739,train,webarena_verified.94.738 +webarena_verified.94.740,False,wikipedia map,program_html,740,test,webarena_verified.94.739 +webarena_verified.94.741,False,wikipedia map,program_html,741,train,webarena_verified.94.740 +webarena_verified.332.742,False,gitlab,backend_state,742,test,webarena_verified.332.736 +webarena_verified.332.743,False,gitlab,backend_state,743,test,webarena_verified.332.742 +webarena_verified.332.744,False,gitlab,backend_state,744,test,webarena_verified.332.743 +webarena_verified.332.745,False,gitlab,backend_state,745,test,webarena_verified.332.744 +webarena_verified.332.746,False,gitlab,backend_state,746,train,webarena_verified.332.745 +webarena_verified.2100.747,False,gitlab,backend_state,747,train,webarena_verified.2100.746 +webarena_verified.2100.748,False,gitlab,backend_state,748,train,webarena_verified.2100.747 +webarena_verified.2100.749,False,gitlab,backend_state,749,test,webarena_verified.2100.748 +webarena_verified.2100.750,False,gitlab,backend_state,750,test,webarena_verified.2100.749 +webarena_verified.2100.751,False,gitlab,backend_state,751,train,webarena_verified.2100.750 +webarena_verified.332.752,False,gitlab,backend_state,752,train,webarena_verified.332.751 
+webarena_verified.332.753,False,gitlab,backend_state,753,test,webarena_verified.332.752 +webarena_verified.332.754,False,gitlab,backend_state,754,train,webarena_verified.332.753 +webarena_verified.332.755,False,gitlab,backend_state,755,test,webarena_verified.332.754 +webarena_verified.332.756,False,gitlab,backend_state,756,train,webarena_verified.332.755 +webarena_verified.42.757,False,map,program_html,757,test,webarena_verified.42.741 +webarena_verified.42.758,False,map,program_html,758,test,webarena_verified.42.757 +webarena_verified.42.759,False,map shopping_admin,program_html,759,test,webarena_verified.42.758 webarena_verified.42.713 +webarena_verified.42.760,False,map shopping_admin,program_html,760,test,webarena_verified.42.759 +webarena_verified.54.761,False,map,program_html,761,train,webarena_verified.54.760 +webarena_verified.54.762,False,map,program_html,762,train,webarena_verified.54.761 +webarena_verified.75.763,False,map,program_html,763,test,webarena_verified.75.762 +webarena_verified.75.764,False,map,program_html,764,test,webarena_verified.75.763 +webarena_verified.75.765,False,map,program_html,765,train,webarena_verified.75.764 +webarena_verified.75.766,False,map,program_html,766,train,webarena_verified.75.765 +webarena_verified.75.767,False,map,program_html,767,train,webarena_verified.75.766 +webarena_verified.241.768,False,shopping_admin,backend_state,768,test,webarena_verified.241.760 +webarena_verified.241.769,False,shopping_admin,backend_state,769,test,webarena_verified.241.768 +webarena_verified.241.770,False,shopping_admin,backend_state,770,train,webarena_verified.241.769 +webarena_verified.243.771,False,shopping_admin,backend_state,771,test,webarena_verified.243.770 +webarena_verified.246.772,False,shopping_admin,backend_state,772,test,webarena_verified.246.771 +webarena_verified.246.773,False,shopping_admin,backend_state,773,train,webarena_verified.246.772 
+webarena_verified.246.774,False,shopping_admin,backend_state,774,train,webarena_verified.246.773 +webarena_verified.246.775,False,shopping_admin,backend_state,775,train,webarena_verified.246.774 +webarena_verified.246.776,False,shopping_admin,backend_state,776,test,webarena_verified.246.775 +webarena_verified.742.777,False,shopping_admin,backend_state,777,train,webarena_verified.742.776 +webarena_verified.742.778,False,shopping_admin,backend_state,778,test,webarena_verified.742.777 +webarena_verified.742.779,False,shopping_admin,backend_state,779,train,webarena_verified.742.778 +webarena_verified.742.780,False,shopping_admin,backend_state,780,test,webarena_verified.742.779 +webarena_verified.742.781,False,shopping_admin,backend_state,781,train,webarena_verified.742.780 +webarena_verified.742.782,False,shopping_admin,backend_state,782,test,webarena_verified.742.781 +webarena_verified.351.783,False,gitlab,retrieve_value,783,train,webarena_verified.351.756 +webarena_verified.316.784,False,gitlab,retrieve_value,784,test,webarena_verified.316.783 +webarena_verified.316.785,False,gitlab,retrieve_value,785,test,webarena_verified.316.784 +webarena_verified.316.786,False,gitlab,retrieve_value,786,test,webarena_verified.316.785 +webarena_verified.316.787,False,gitlab,retrieve_value,787,test,webarena_verified.316.786 +webarena_verified.316.788,False,gitlab,retrieve_value,788,test,webarena_verified.316.787 +webarena_verified.328.789,False,gitlab,retrieve_value,789,test,webarena_verified.328.788 +webarena_verified.246.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.246.782 +webarena_verified.84.791,False,gitlab reddit,string_match,791,train,webarena_verified.84.789 webarena_verified.84.735 +webarena_verified.172.792,False,shopping,retrieve_value,792,test,webarena_verified.172.693 +webarena_verified.172.793,False,shopping,retrieve_value,793,train,webarena_verified.172.792 
+webarena_verified.191.794,False,shopping,retrieve_value,794,test,webarena_verified.191.793 +webarena_verified.191.795,False,shopping,retrieve_value,795,train,webarena_verified.191.794 +webarena_verified.191.796,False,shopping,retrieve_value,796,train,webarena_verified.191.795 +webarena_verified.191.797,False,shopping,retrieve_value,797,test,webarena_verified.191.796 +webarena_verified.191.798,False,shopping,retrieve_value,798,train,webarena_verified.191.797 +webarena_verified.600.799,False,gitlab,backend_state,799,train,webarena_verified.600.791 +webarena_verified.600.800,False,gitlab,backend_state,800,test,webarena_verified.600.799 +webarena_verified.600.801,False,gitlab,backend_state,801,train,webarena_verified.600.800 +webarena_verified.600.802,False,gitlab,backend_state,802,train,webarena_verified.600.801 +webarena_verified.600.803,False,gitlab,backend_state,803,test,webarena_verified.600.802 +webarena_verified.999.804,False,gitlab,backend_state,804,train,webarena_verified.999.803 +webarena_verified.335.805,False,gitlab,backend_state,805,test,webarena_verified.335.804 +webarena_verified.335.806,False,gitlab,backend_state,806,test,webarena_verified.335.805 +webarena_verified.335.807,False,gitlab,backend_state,807,train,webarena_verified.335.806 +webarena_verified.327.808,False,gitlab,backend_state,808,train,webarena_verified.327.807 +webarena_verified.327.809,False,gitlab,backend_state,809,train,webarena_verified.327.808 +webarena_verified.999.810,False,gitlab,backend_state,810,test,webarena_verified.999.809 +webarena_verified.999.811,False,gitlab,backend_state,811,test,webarena_verified.999.810 diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py index fffda0ee..030d4830 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py @@ -14,8 +14,8 @@ 
ALL_WEBARENA_TASK_IDS = [] # register all WebArena benchmark -for task_id in config.TASK_IDS: - gym_id = f"webarena_verified.{task_id}" +for task_id, intent_template_id in zip(config.TASK_IDS, config.INTENT_TEMPLATE_IDS): + gym_id = f"webarena_verified.{intent_template_id}.{task_id}" register_task( gym_id, task.WebArenaVerifiedTask, diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index b3aabb95..b79e414d 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -1 +1,12 @@ +import json + TASK_IDS = range(812) +INTENT_TEMPLATE_IDS = [] + +with open("browsergym/webarena_verified/webarena_verified.json", "r") as f: + data = json.load(f) + +for task in data: + INTENT_TEMPLATE_IDS.append(task["intent_template_id"]) + +assert len(INTENT_TEMPLATE_IDS) == len(TASK_IDS), "Number of intent template IDs must match number of task IDs" From 5b05044ed89d84654f7ee68af19d5636b6805e80 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 23 Oct 2025 15:19:09 +0000 Subject: [PATCH 18/64] fix config --- .../src/browsergym/webarena_verified/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index b79e414d..b9766ad7 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -1,9 +1,10 @@ import json +from pathlib import Path TASK_IDS = range(812) INTENT_TEMPLATE_IDS = [] -with open("browsergym/webarena_verified/webarena_verified.json", "r") as f: +with open(Path(__file__).parent / "webarena_verified.json", "r") as f: data = json.load(f) for task in data: From 
56574eb94549671e918f41640c8c1c9d8a8be1d3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 23 Oct 2025 15:44:59 +0000 Subject: [PATCH 19/64] fix csv file --- .../benchmark/metadata/webarena_verified.csv | 402 +++++++++--------- 1 file changed, 201 insertions(+), 201 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv index 94dae972..8bdd5a95 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv @@ -10,12 +10,12 @@ webarena_verified.79.7,False,map,retrieve_value,7,train, webarena_verified.79.8,False,map,string_match,8,test,webarena_verified.79.7 webarena_verified.79.9,False,map,retrieve_value,9,test,webarena_verified.79.8 webarena_verified.79.10,False,map,retrieve_value,10,test,webarena_verified.79.9 -webarena_verified.288.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.288.6 +webarena_verified.288.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.279.6 webarena_verified.288.12,False,shopping_admin,retrieve_value,12,train,webarena_verified.288.11 webarena_verified.288.13,False,shopping_admin,retrieve_value,13,train,webarena_verified.288.12 webarena_verified.288.14,False,shopping_admin,retrieve_value,14,train,webarena_verified.288.13 webarena_verified.288.15,False,shopping_admin,retrieve_value,15,test,webarena_verified.288.14 -webarena_verified.73.16,False,map,string_match,16,test,webarena_verified.73.10 +webarena_verified.73.16,False,map,string_match,16,test,webarena_verified.79.10 webarena_verified.73.17,False,map,string_match,17,train,webarena_verified.73.16 webarena_verified.73.18,False,map,string_match,18,test,webarena_verified.73.17 webarena_verified.73.19,False,map,string_match,19,train,webarena_verified.73.18 @@ -31,783 
+31,783 @@ webarena_verified.33.28,False,reddit,retrieve_value,28,train,webarena_verified.3 webarena_verified.33.29,False,reddit,retrieve_value,29,train,webarena_verified.33.28 webarena_verified.33.30,False,reddit,retrieve_value,30,test,webarena_verified.33.29 webarena_verified.33.31,False,reddit,retrieve_value,31,train,webarena_verified.33.30 -webarena_verified.78.32,False,map,retrieve_value,32,test,webarena_verified.78.20 +webarena_verified.78.32,False,map,retrieve_value,32,test,webarena_verified.73.20 webarena_verified.78.33,False,map,retrieve_value,33,test,webarena_verified.78.32 webarena_verified.78.34,False,map,retrieve_value,34,train,webarena_verified.78.33 webarena_verified.78.35,False,map,retrieve_value,35,test,webarena_verified.78.34 -webarena_verified.77.36,False,map,retrieve_value,36,test,webarena_verified.77.35 +webarena_verified.77.36,False,map,retrieve_value,36,test,webarena_verified.78.35 webarena_verified.77.37,False,map,retrieve_value,37,train,webarena_verified.77.36 webarena_verified.77.38,False,map,retrieve_value,38,train,webarena_verified.77.37 webarena_verified.77.39,False,map,retrieve_value,39,train,webarena_verified.77.38 webarena_verified.77.40,False,map,retrieve_value,40,test,webarena_verified.77.39 -webarena_verified.285.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.285.15 +webarena_verified.285.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.288.15 webarena_verified.285.42,False,shopping_admin,retrieve_value,42,train,webarena_verified.285.41 webarena_verified.285.43,False,shopping_admin,retrieve_value,43,test,webarena_verified.285.42 webarena_verified.303.44,False,gitlab,ui_state,44,train, -webarena_verified.300.45,False,gitlab,ui_state,45,test,webarena_verified.300.44 +webarena_verified.300.45,False,gitlab,ui_state,45,test,webarena_verified.303.44 webarena_verified.300.46,False,gitlab,ui_state,46,test,webarena_verified.300.45 
-webarena_verified.197.47,False,shopping,retrieve_value,47,train,webarena_verified.197.26 +webarena_verified.197.47,False,shopping,retrieve_value,47,train,webarena_verified.222.26 webarena_verified.197.48,False,shopping,retrieve_value,48,test,webarena_verified.197.47 webarena_verified.197.49,False,shopping,retrieve_value,49,train,webarena_verified.197.48 webarena_verified.197.50,False,shopping,retrieve_value,50,train,webarena_verified.197.49 webarena_verified.197.51,False,shopping,retrieve_value,51,test,webarena_verified.197.50 -webarena_verified.68.52,False,map,string_match,52,test,webarena_verified.68.40 +webarena_verified.68.52,False,map,string_match,52,test,webarena_verified.77.40 webarena_verified.68.53,False,map,string_match,53,train,webarena_verified.68.52 webarena_verified.68.54,False,map,string_match,54,test,webarena_verified.68.53 webarena_verified.68.55,False,map,string_match,55,train,webarena_verified.68.54 webarena_verified.68.56,False,map,string_match,56,train,webarena_verified.68.55 -webarena_verified.69.57,False,map,retrieve_value,57,train,webarena_verified.69.56 +webarena_verified.69.57,False,map,retrieve_value,57,train,webarena_verified.68.56 webarena_verified.69.58,False,map,retrieve_value,58,train,webarena_verified.69.57 webarena_verified.69.59,False,map,retrieve_value,59,test,webarena_verified.69.58 webarena_verified.69.60,False,map,retrieve_value,60,test,webarena_verified.69.59 webarena_verified.69.61,False,map,retrieve_value,61,train,webarena_verified.69.60 -webarena_verified.276.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.276.43 +webarena_verified.276.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.285.43 webarena_verified.276.63,False,shopping_admin,retrieve_value,63,test,webarena_verified.276.62 webarena_verified.276.64,False,shopping_admin,retrieve_value,64,test,webarena_verified.276.63 webarena_verified.276.65,False,shopping_admin,retrieve_value,65,train,webarena_verified.276.64 
-webarena_verified.17.66,False,reddit,retrieve_value,66,test,webarena_verified.17.31 +webarena_verified.17.66,False,reddit,retrieve_value,66,test,webarena_verified.33.31 webarena_verified.17.67,False,reddit,retrieve_value,67,test,webarena_verified.17.66 webarena_verified.17.68,False,reddit,retrieve_value,68,train,webarena_verified.17.67 webarena_verified.17.69,False,reddit,retrieve_value,69,test,webarena_verified.17.68 -webarena_verified.70.70,False,map,retrieve_value,70,train,webarena_verified.70.61 +webarena_verified.70.70,False,map,retrieve_value,70,train,webarena_verified.69.61 webarena_verified.70.71,False,map,retrieve_value,71,test,webarena_verified.70.70 webarena_verified.70.72,False,map,retrieve_value,72,train,webarena_verified.70.71 webarena_verified.70.73,False,map,retrieve_value,73,test,webarena_verified.70.72 -webarena_verified.65.74,False,map,string_match,74,train,webarena_verified.65.73 +webarena_verified.65.74,False,map,string_match,74,train,webarena_verified.70.73 webarena_verified.65.75,False,map,string_match,75,train,webarena_verified.65.74 webarena_verified.65.76,False,map,retrieve_value,76,train,webarena_verified.65.75 -webarena_verified.277.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.277.65 +webarena_verified.277.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.276.65 webarena_verified.277.78,False,shopping_admin,retrieve_value,78,train,webarena_verified.277.77 webarena_verified.277.79,False,shopping_admin,retrieve_value,79,test,webarena_verified.277.78 -webarena_verified.72.80,False,map,string_match,80,test,webarena_verified.72.76 +webarena_verified.72.80,False,map,string_match,80,test,webarena_verified.65.76 webarena_verified.72.81,False,map,string_match,81,test,webarena_verified.72.80 webarena_verified.72.82,False,map,string_match,82,train,webarena_verified.72.81 webarena_verified.72.83,False,map,string_match,83,train,webarena_verified.72.82 
-webarena_verified.64.84,False,map,string_match,84,train,webarena_verified.64.83 +webarena_verified.64.84,False,map,string_match,84,train,webarena_verified.72.83 webarena_verified.64.85,False,map,string_match,85,test,webarena_verified.64.84 webarena_verified.64.86,False,map,string_match,86,test,webarena_verified.64.85 webarena_verified.64.87,False,map,string_match,87,train,webarena_verified.64.86 webarena_verified.64.88,False,map,string_match,88,train,webarena_verified.64.87 -webarena_verified.67.89,False,map,retrieve_value,89,test,webarena_verified.67.88 +webarena_verified.67.89,False,map,retrieve_value,89,test,webarena_verified.64.88 webarena_verified.67.90,False,map,retrieve_value,90,test,webarena_verified.67.89 webarena_verified.67.91,False,map,retrieve_value,91,train,webarena_verified.67.90 webarena_verified.67.92,False,map,retrieve_value,92,train,webarena_verified.67.91 webarena_verified.67.93,False,map,retrieve_value,93,train,webarena_verified.67.92 -webarena_verified.274.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.274.79 +webarena_verified.274.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.277.79 webarena_verified.274.95,False,shopping_admin,retrieve_value,95,train,webarena_verified.274.94 -webarena_verified.193.96,False,shopping,retrieve_value,96,test,webarena_verified.193.51 -webarena_verified.120.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.120.93 -webarena_verified.66.98,False,map,retrieve_value,98,test,webarena_verified.66.97 +webarena_verified.193.96,False,shopping,retrieve_value,96,test,webarena_verified.197.51 +webarena_verified.120.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.67.93 +webarena_verified.66.98,False,map,retrieve_value,98,test,webarena_verified.120.97 webarena_verified.66.99,False,map,retrieve_value,99,train,webarena_verified.66.98 webarena_verified.66.100,False,map,retrieve_value,100,test,webarena_verified.66.99 
webarena_verified.66.101,False,map,string_match,101,train,webarena_verified.66.100 -webarena_verified.349.102,False,gitlab,ui_state,102,train,webarena_verified.349.46 +webarena_verified.349.102,False,gitlab,ui_state,102,train,webarena_verified.300.46 webarena_verified.349.103,False,gitlab,ui_state,103,train,webarena_verified.349.102 webarena_verified.349.104,False,gitlab,ui_state,104,test,webarena_verified.349.103 webarena_verified.349.105,False,gitlab,ui_state,105,train,webarena_verified.349.104 webarena_verified.349.106,False,gitlab,ui_state,106,test,webarena_verified.349.105 -webarena_verified.270.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.270.95 +webarena_verified.270.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.274.95 webarena_verified.270.108,False,shopping_admin,retrieve_value,108,train,webarena_verified.270.107 webarena_verified.270.109,False,shopping_admin,retrieve_value,109,test,webarena_verified.270.108 webarena_verified.270.110,False,shopping_admin,retrieve_value,110,train,webarena_verified.270.109 webarena_verified.270.111,False,shopping_admin,retrieve_value,111,train,webarena_verified.270.110 -webarena_verified.245.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.245.111 +webarena_verified.245.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.270.111 webarena_verified.245.113,False,shopping_admin,retrieve_value,113,test,webarena_verified.245.112 webarena_verified.245.114,False,shopping_admin,retrieve_value,114,train,webarena_verified.245.113 webarena_verified.245.115,False,shopping_admin,retrieve_value,115,test,webarena_verified.245.114 webarena_verified.245.116,False,shopping_admin,retrieve_value,116,test,webarena_verified.245.115 -webarena_verified.161.117,False,shopping,retrieve_value,117,test,webarena_verified.161.96 -webarena_verified.151.118,False,shopping,program_html,118,train,webarena_verified.151.117 
-webarena_verified.250.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.250.116 +webarena_verified.161.117,False,shopping,retrieve_value,117,test,webarena_verified.193.96 +webarena_verified.151.118,False,shopping,program_html,118,train,webarena_verified.161.117 +webarena_verified.250.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.245.116 webarena_verified.250.120,False,shopping_admin,retrieve_value,120,train,webarena_verified.250.119 webarena_verified.250.121,False,shopping_admin,retrieve_value,121,train,webarena_verified.250.120 webarena_verified.250.122,False,shopping_admin,retrieve_value,122,test,webarena_verified.250.121 webarena_verified.250.123,False,shopping_admin,retrieve_value,123,train,webarena_verified.250.122 -webarena_verified.159.124,False,shopping,retrieve_value,124,train,webarena_verified.159.118 +webarena_verified.159.124,False,shopping,retrieve_value,124,train,webarena_verified.151.118 webarena_verified.159.125,False,shopping,retrieve_value,125,train,webarena_verified.159.124 webarena_verified.159.126,False,shopping,retrieve_value,126,test,webarena_verified.159.125 -webarena_verified.1001.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.1001.123 -webarena_verified.1002.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.1002.127 +webarena_verified.1001.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.250.123 +webarena_verified.1002.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.1001.127 webarena_verified.1002.129,False,shopping_admin,retrieve_value,129,train,webarena_verified.1002.128 webarena_verified.1002.130,False,shopping_admin,retrieve_value,130,train,webarena_verified.1002.129 webarena_verified.1002.131,False,shopping_admin,retrieve_value,131,test,webarena_verified.1002.130 -webarena_verified.322.132,False,gitlab,retrieve_value,132,train,webarena_verified.322.106 
+webarena_verified.322.132,False,gitlab,retrieve_value,132,train,webarena_verified.349.106 webarena_verified.322.133,False,gitlab,retrieve_value,133,test,webarena_verified.322.132 webarena_verified.322.134,False,gitlab,retrieve_value,134,test,webarena_verified.322.133 webarena_verified.322.135,False,gitlab,retrieve_value,135,train,webarena_verified.322.134 webarena_verified.322.136,False,gitlab,retrieve_value,136,train,webarena_verified.322.135 -webarena_verified.51.137,False,map,string_match,137,test,webarena_verified.51.101 +webarena_verified.51.137,False,map,string_match,137,test,webarena_verified.66.101 webarena_verified.51.138,False,map,string_match,138,test,webarena_verified.51.137 webarena_verified.51.139,False,map,string_match,139,test,webarena_verified.51.138 webarena_verified.51.140,False,map,string_match,140,train,webarena_verified.51.139 -webarena_verified.162.141,False,shopping,retrieve_value,141,train,webarena_verified.162.126 +webarena_verified.162.141,False,shopping,retrieve_value,141,train,webarena_verified.159.126 webarena_verified.162.142,False,shopping,retrieve_value,142,train,webarena_verified.162.141 webarena_verified.162.143,False,shopping,retrieve_value,143,test,webarena_verified.162.142 webarena_verified.162.144,False,shopping,retrieve_value,144,test,webarena_verified.162.143 webarena_verified.162.145,False,shopping,retrieve_value,145,train,webarena_verified.162.144 -webarena_verified.155.146,False,shopping,retrieve_value,146,test,webarena_verified.155.145 +webarena_verified.155.146,False,shopping,retrieve_value,146,test,webarena_verified.162.145 webarena_verified.155.147,False,shopping,retrieve_value,147,train,webarena_verified.155.146 webarena_verified.155.148,False,shopping,retrieve_value,148,train,webarena_verified.155.147 webarena_verified.155.149,False,shopping,retrieve_value,149,test,webarena_verified.155.148 webarena_verified.155.150,False,shopping,retrieve_value,150,train,webarena_verified.155.149 
-webarena_verified.36.151,False,map,string_match,151,train,webarena_verified.36.140 +webarena_verified.36.151,False,map,string_match,151,train,webarena_verified.51.140 webarena_verified.36.152,False,map,string_match,152,train,webarena_verified.36.151 webarena_verified.36.153,False,map,string_match,153,test,webarena_verified.36.152 webarena_verified.36.154,False,map,string_match,154,train,webarena_verified.36.153 webarena_verified.36.155,False,map,string_match,155,test,webarena_verified.36.154 -webarena_verified.290.156,False,gitlab,ui_state,156,test,webarena_verified.290.136 -webarena_verified.255.157,False,shopping_admin,ui_state,157,train,webarena_verified.255.131 -webarena_verified.171.158,False,shopping,ui_state,158,test,webarena_verified.171.150 +webarena_verified.290.156,False,gitlab,ui_state,156,test,webarena_verified.322.136 +webarena_verified.255.157,False,shopping_admin,ui_state,157,train,webarena_verified.1002.131 +webarena_verified.171.158,False,shopping,ui_state,158,test,webarena_verified.155.150 webarena_verified.171.159,False,shopping,ui_state,159,train,webarena_verified.171.158 webarena_verified.171.160,False,shopping,ui_state,160,train,webarena_verified.171.159 webarena_verified.171.161,False,shopping,ui_state,161,train,webarena_verified.171.160 webarena_verified.171.162,False,shopping,ui_state,162,test,webarena_verified.171.161 -webarena_verified.136.163,False,shopping,retrieve_value,163,test,webarena_verified.136.162 +webarena_verified.136.163,False,shopping,retrieve_value,163,test,webarena_verified.171.162 webarena_verified.136.164,False,shopping,retrieve_value,164,test,webarena_verified.136.163 webarena_verified.136.165,False,shopping,retrieve_value,165,test,webarena_verified.136.164 webarena_verified.136.166,False,shopping,retrieve_value,166,test,webarena_verified.136.165 webarena_verified.136.167,False,shopping,retrieve_value,167,test,webarena_verified.136.166 
-webarena_verified.289.168,False,gitlab,retrieve_value,168,test,webarena_verified.289.156 +webarena_verified.289.168,False,gitlab,retrieve_value,168,test,webarena_verified.290.156 webarena_verified.289.169,False,gitlab,retrieve_value,169,train,webarena_verified.289.168 webarena_verified.289.170,False,gitlab,retrieve_value,170,train,webarena_verified.289.169 webarena_verified.289.171,False,gitlab,retrieve_value,171,test,webarena_verified.289.170 webarena_verified.289.172,False,gitlab,retrieve_value,172,train,webarena_verified.289.171 -webarena_verified.310.173,False,gitlab,retrieve_value,173,train,webarena_verified.310.172 +webarena_verified.310.173,False,gitlab,retrieve_value,173,train,webarena_verified.289.172 webarena_verified.310.174,False,gitlab,retrieve_value,174,test,webarena_verified.310.173 webarena_verified.310.175,False,gitlab,retrieve_value,175,train,webarena_verified.310.174 webarena_verified.310.176,False,gitlab,retrieve_value,176,train,webarena_verified.310.175 webarena_verified.310.177,False,gitlab,retrieve_value,177,test,webarena_verified.310.176 -webarena_verified.500.178,False,gitlab,retrieve_value,178,test,webarena_verified.500.177 +webarena_verified.500.178,False,gitlab,retrieve_value,178,test,webarena_verified.310.177 webarena_verified.500.179,False,gitlab,retrieve_value,179,train,webarena_verified.500.178 webarena_verified.500.180,False,gitlab,retrieve_value,180,train,webarena_verified.500.179 webarena_verified.500.181,False,gitlab,retrieve_value,181,test,webarena_verified.500.180 webarena_verified.500.182,False,gitlab,retrieve_value,182,train,webarena_verified.500.181 -webarena_verified.368.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.368.157 +webarena_verified.368.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.255.157 webarena_verified.368.184,False,shopping_admin,retrieve_value,184,train,webarena_verified.368.183 
webarena_verified.368.185,False,shopping_admin,retrieve_value,185,test,webarena_verified.368.184 webarena_verified.368.186,False,shopping_admin,retrieve_value,186,train,webarena_verified.368.185 webarena_verified.368.187,False,shopping_admin,retrieve_value,187,test,webarena_verified.368.186 -webarena_verified.214.188,False,shopping,retrieve_value,188,test,webarena_verified.214.167 +webarena_verified.214.188,False,shopping,retrieve_value,188,test,webarena_verified.136.167 webarena_verified.214.189,False,shopping,retrieve_value,189,train,webarena_verified.214.188 webarena_verified.214.190,False,shopping,retrieve_value,190,train,webarena_verified.214.189 webarena_verified.214.191,False,shopping,retrieve_value,191,train,webarena_verified.214.190 webarena_verified.214.192,False,shopping,retrieve_value,192,test,webarena_verified.214.191 -webarena_verified.367.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.367.187 +webarena_verified.367.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.368.187 webarena_verified.367.194,False,shopping_admin,retrieve_value,194,train,webarena_verified.367.193 webarena_verified.367.195,False,shopping_admin,retrieve_value,195,test,webarena_verified.367.194 webarena_verified.367.196,False,shopping_admin,retrieve_value,196,train,webarena_verified.367.195 webarena_verified.367.197,False,shopping_admin,retrieve_value,197,train,webarena_verified.367.196 -webarena_verified.366.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.366.197 +webarena_verified.366.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.367.197 webarena_verified.366.199,False,shopping_admin,retrieve_value,199,train,webarena_verified.366.198 webarena_verified.366.200,False,shopping_admin,retrieve_value,200,train,webarena_verified.366.199 webarena_verified.366.201,False,shopping_admin,retrieve_value,201,test,webarena_verified.366.200 
webarena_verified.366.202,False,shopping_admin,retrieve_value,202,train,webarena_verified.366.201 webarena_verified.366.203,False,shopping_admin,retrieve_value,203,test,webarena_verified.366.202 webarena_verified.366.204,False,shopping_admin,retrieve_value,204,test,webarena_verified.366.203 -webarena_verified.320.205,False,gitlab,retrieve_value,205,train,webarena_verified.320.182 +webarena_verified.320.205,False,gitlab,retrieve_value,205,train,webarena_verified.500.182 webarena_verified.320.206,False,gitlab,retrieve_value,206,test,webarena_verified.320.205 webarena_verified.320.207,False,gitlab,retrieve_value,207,test,webarena_verified.320.206 -webarena_verified.364.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.364.204 +webarena_verified.364.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.366.204 webarena_verified.364.209,False,shopping_admin,retrieve_value,209,test,webarena_verified.364.208 webarena_verified.364.210,False,shopping_admin,retrieve_value,210,train,webarena_verified.364.209 webarena_verified.364.211,False,shopping_admin,retrieve_value,211,train,webarena_verified.364.210 webarena_verified.364.212,False,shopping_admin,retrieve_value,212,train,webarena_verified.364.211 -webarena_verified.249.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.249.212 +webarena_verified.249.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.364.212 webarena_verified.249.214,False,shopping_admin,retrieve_value,214,train,webarena_verified.249.213 webarena_verified.249.215,False,shopping_admin,retrieve_value,215,test,webarena_verified.249.214 webarena_verified.249.216,False,shopping_admin,retrieve_value,216,train,webarena_verified.249.215 webarena_verified.249.217,False,shopping_admin,retrieve_value,217,train,webarena_verified.249.216 -webarena_verified.41.218,False,map,string_match,218,train,webarena_verified.41.155 +webarena_verified.41.218,False,map,string_match,218,train,webarena_verified.36.155 
webarena_verified.41.219,False,map,string_match,219,test,webarena_verified.41.218 webarena_verified.41.220,False,map,string_match,220,train,webarena_verified.41.219 -webarena_verified.35.221,False,map,string_match,221,test,webarena_verified.35.220 +webarena_verified.35.221,False,map,string_match,221,test,webarena_verified.41.220 webarena_verified.35.222,False,map,string_match,222,train,webarena_verified.35.221 webarena_verified.35.223,False,map,string_match,223,test,webarena_verified.35.222 webarena_verified.35.224,False,map,string_match,224,test,webarena_verified.35.223 -webarena_verified.135.225,False,shopping,retrieve_value,225,test,webarena_verified.135.192 -webarena_verified.370.226,False,shopping,retrieve_value,226,train,webarena_verified.370.225 +webarena_verified.135.225,False,shopping,retrieve_value,225,test,webarena_verified.214.192 +webarena_verified.370.226,False,shopping,retrieve_value,226,train,webarena_verified.135.225 webarena_verified.370.227,False,shopping,retrieve_value,227,train,webarena_verified.370.226 webarena_verified.370.228,False,shopping,retrieve_value,228,test,webarena_verified.370.227 webarena_verified.370.229,False,shopping,retrieve_value,229,test,webarena_verified.370.228 webarena_verified.370.230,False,shopping,retrieve_value,230,train,webarena_verified.370.229 -webarena_verified.213.231,False,shopping,retrieve_value,231,test,webarena_verified.213.230 +webarena_verified.213.231,False,shopping,retrieve_value,231,test,webarena_verified.370.230 webarena_verified.213.232,False,shopping,retrieve_value,232,train,webarena_verified.213.231 webarena_verified.213.233,False,shopping,retrieve_value,233,test,webarena_verified.213.232 webarena_verified.213.234,False,shopping,retrieve_value,234,train,webarena_verified.213.233 webarena_verified.213.235,False,shopping,retrieve_value,235,train,webarena_verified.213.234 -webarena_verified.39.236,False,map,retrieve_value,236,train,webarena_verified.39.224 
+webarena_verified.39.236,False,map,retrieve_value,236,train,webarena_verified.35.224 webarena_verified.39.237,False,map,retrieve_value,237,train,webarena_verified.39.236 -webarena_verified.138.238,False,shopping,ui_state,238,train,webarena_verified.138.235 +webarena_verified.138.238,False,shopping,ui_state,238,train,webarena_verified.213.235 webarena_verified.138.239,False,shopping,ui_state,239,train,webarena_verified.138.238 webarena_verified.138.240,False,shopping,ui_state,240,test,webarena_verified.138.239 webarena_verified.138.241,False,shopping,ui_state,241,train,webarena_verified.138.240 webarena_verified.138.242,False,shopping,ui_state,242,test,webarena_verified.138.241 -webarena_verified.244.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.244.217 +webarena_verified.244.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.249.217 webarena_verified.244.244,False,shopping_admin,retrieve_value,244,test,webarena_verified.244.243 webarena_verified.244.245,False,shopping_admin,retrieve_value,245,train,webarena_verified.244.244 webarena_verified.244.246,False,shopping_admin,retrieve_value,246,test,webarena_verified.244.245 webarena_verified.244.247,False,shopping_admin,retrieve_value,247,train,webarena_verified.244.246 -webarena_verified.46.248,False,map,retrieve_value,248,test,webarena_verified.46.237 +webarena_verified.46.248,False,map,retrieve_value,248,test,webarena_verified.39.237 webarena_verified.46.249,False,map,retrieve_value,249,train,webarena_verified.46.248 webarena_verified.46.250,False,map,retrieve_value,250,test,webarena_verified.46.249 webarena_verified.46.251,False,map,retrieve_value,251,train,webarena_verified.46.250 webarena_verified.46.252,False,map,retrieve_value,252,train,webarena_verified.46.251 -webarena_verified.501.253,False,map,string_match,253,test,webarena_verified.501.252 +webarena_verified.501.253,False,map,string_match,253,test,webarena_verified.46.252 
webarena_verified.501.254,False,map,retrieve_value,254,train,webarena_verified.501.253 webarena_verified.501.255,False,map,retrieve_value,255,test,webarena_verified.501.254 webarena_verified.501.256,False,map,retrieve_value,256,train,webarena_verified.501.255 webarena_verified.501.257,False,map,string_match,257,test,webarena_verified.501.256 -webarena_verified.325.258,False,gitlab,ui_state,258,train,webarena_verified.325.207 -webarena_verified.312.259,False,gitlab,retrieve_value,259,train,webarena_verified.312.258 -webarena_verified.211.260,False,shopping,ui_state,260,test,webarena_verified.211.242 +webarena_verified.325.258,False,gitlab,ui_state,258,train,webarena_verified.320.207 +webarena_verified.312.259,False,gitlab,retrieve_value,259,train,webarena_verified.325.258 +webarena_verified.211.260,False,shopping,ui_state,260,test,webarena_verified.138.242 webarena_verified.211.261,False,shopping,ui_state,261,train,webarena_verified.211.260 webarena_verified.211.262,False,shopping,ui_state,262,train,webarena_verified.211.261 webarena_verified.211.263,False,shopping,ui_state,263,test,webarena_verified.211.262 webarena_verified.211.264,False,shopping,ui_state,264,train,webarena_verified.211.263 -webarena_verified.85.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.85.257 +webarena_verified.85.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.501.257 webarena_verified.85.266,False,wikipedia map,retrieve_value,266,test,webarena_verified.85.265 webarena_verified.85.267,False,wikipedia map,retrieve_value,267,train,webarena_verified.85.266 webarena_verified.85.268,False,wikipedia map,retrieve_value,268,test,webarena_verified.85.267 -webarena_verified.139.269,False,shopping,ui_state,269,train,webarena_verified.139.264 +webarena_verified.139.269,False,shopping,ui_state,269,train,webarena_verified.211.264 webarena_verified.139.270,False,shopping,ui_state,270,train,webarena_verified.139.269 
webarena_verified.139.271,False,shopping,ui_state,271,test,webarena_verified.139.270 webarena_verified.139.272,False,shopping,ui_state,272,test,webarena_verified.139.271 webarena_verified.139.273,False,shopping,ui_state,273,train,webarena_verified.139.272 -webarena_verified.212.274,False,shopping,ui_state,274,test,webarena_verified.212.273 +webarena_verified.212.274,False,shopping,ui_state,274,test,webarena_verified.139.273 webarena_verified.212.275,False,shopping,ui_state,275,test,webarena_verified.212.274 webarena_verified.212.276,False,shopping,ui_state,276,train,webarena_verified.212.275 webarena_verified.212.277,False,shopping,ui_state,277,train,webarena_verified.212.276 webarena_verified.212.278,False,shopping,ui_state,278,train,webarena_verified.212.277 -webarena_verified.204.279,False,shopping,retrieve_value,279,train,webarena_verified.204.278 +webarena_verified.204.279,False,shopping,retrieve_value,279,train,webarena_verified.212.278 webarena_verified.204.280,False,shopping,retrieve_value,280,test,webarena_verified.204.279 webarena_verified.204.281,False,shopping,retrieve_value,281,train,webarena_verified.204.280 webarena_verified.204.282,False,shopping,retrieve_value,282,train,webarena_verified.204.281 -webarena_verified.210.283,False,shopping,ui_state,283,test,webarena_verified.210.282 -webarena_verified.207.284,False,shopping,ui_state,284,test,webarena_verified.207.283 +webarena_verified.210.283,False,shopping,ui_state,283,test,webarena_verified.204.282 +webarena_verified.207.284,False,shopping,ui_state,284,test,webarena_verified.210.283 webarena_verified.207.285,False,shopping,ui_state,285,train,webarena_verified.207.284 webarena_verified.207.286,False,shopping,ui_state,286,test,webarena_verified.207.285 -webarena_verified.47.287,False,map,string_match,287,test,webarena_verified.47.268 -webarena_verified.234.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.234.247 
+webarena_verified.47.287,False,map,string_match,287,test,webarena_verified.85.268 +webarena_verified.234.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.244.247 webarena_verified.234.289,False,shopping_admin,retrieve_value,289,test,webarena_verified.234.288 webarena_verified.234.290,False,shopping_admin,retrieve_value,290,train,webarena_verified.234.289 webarena_verified.234.291,False,shopping_admin,retrieve_value,291,train,webarena_verified.234.290 webarena_verified.234.292,False,shopping_admin,retrieve_value,292,test,webarena_verified.234.291 -webarena_verified.329.293,False,gitlab,retrieve_value,293,train,webarena_verified.329.259 +webarena_verified.329.293,False,gitlab,retrieve_value,293,train,webarena_verified.312.259 webarena_verified.329.294,False,gitlab,retrieve_value,294,train,webarena_verified.329.293 webarena_verified.329.295,False,gitlab,retrieve_value,295,test,webarena_verified.329.294 webarena_verified.329.296,False,gitlab,retrieve_value,296,train,webarena_verified.329.295 webarena_verified.329.297,False,gitlab,retrieve_value,297,test,webarena_verified.329.296 -webarena_verified.180.298,False,shopping,ui_state,298,train,webarena_verified.180.286 +webarena_verified.180.298,False,shopping,ui_state,298,train,webarena_verified.207.286 webarena_verified.180.299,False,shopping,ui_state,299,train,webarena_verified.180.298 webarena_verified.180.300,False,shopping,ui_state,300,test,webarena_verified.180.299 webarena_verified.180.301,False,shopping,retrieve_value,301,test,webarena_verified.180.300 webarena_verified.180.302,False,shopping,retrieve_value,302,train,webarena_verified.180.301 -webarena_verified.321.303,False,gitlab,retrieve_value,303,test,webarena_verified.321.297 +webarena_verified.321.303,False,gitlab,retrieve_value,303,test,webarena_verified.329.297 webarena_verified.321.304,False,gitlab,retrieve_value,304,train,webarena_verified.321.303 webarena_verified.321.305,False,gitlab,retrieve_value,305,train,webarena_verified.321.304 
webarena_verified.321.306,False,gitlab,retrieve_value,306,test,webarena_verified.321.305 webarena_verified.321.307,False,gitlab,retrieve_value,307,train,webarena_verified.321.306 -webarena_verified.323.308,False,gitlab,retrieve_value,308,train,webarena_verified.323.307 +webarena_verified.323.308,False,gitlab,retrieve_value,308,train,webarena_verified.321.307 webarena_verified.323.309,False,gitlab,retrieve_value,309,train,webarena_verified.323.308 webarena_verified.323.310,False,gitlab,retrieve_value,310,train,webarena_verified.323.309 webarena_verified.323.311,False,gitlab,retrieve_value,311,test,webarena_verified.323.310 webarena_verified.323.312,False,gitlab,retrieve_value,312,test,webarena_verified.323.311 -webarena_verified.134.313,False,shopping,retrieve_value,313,train,webarena_verified.134.302 -webarena_verified.324.314,False,gitlab,retrieve_value,314,train,webarena_verified.324.312 +webarena_verified.134.313,False,shopping,retrieve_value,313,train,webarena_verified.180.302 +webarena_verified.324.314,False,gitlab,retrieve_value,314,train,webarena_verified.323.312 webarena_verified.324.315,False,gitlab,retrieve_value,315,train,webarena_verified.324.314 webarena_verified.324.316,False,gitlab,retrieve_value,316,test,webarena_verified.324.315 webarena_verified.324.317,False,gitlab,retrieve_value,317,test,webarena_verified.324.316 webarena_verified.324.318,False,gitlab,retrieve_value,318,train,webarena_verified.324.317 -webarena_verified.160.319,False,shopping,retrieve_value,319,train,webarena_verified.160.313 +webarena_verified.160.319,False,shopping,retrieve_value,319,train,webarena_verified.134.313 webarena_verified.160.320,False,shopping,retrieve_value,320,test,webarena_verified.160.319 webarena_verified.160.321,False,shopping,retrieve_value,321,train,webarena_verified.160.320 webarena_verified.160.322,False,shopping,retrieve_value,322,test,webarena_verified.160.321 webarena_verified.160.323,False,shopping,retrieve_value,323,train,webarena_verified.160.322 
-webarena_verified.208.324,False,shopping,ui_state,324,train,webarena_verified.208.323 +webarena_verified.208.324,False,shopping,ui_state,324,train,webarena_verified.160.323 webarena_verified.208.325,False,shopping,ui_state,325,test,webarena_verified.208.324 webarena_verified.208.326,False,shopping,ui_state,326,train,webarena_verified.208.325 webarena_verified.208.327,False,shopping,ui_state,327,test,webarena_verified.208.326 webarena_verified.208.328,False,shopping,ui_state,328,train,webarena_verified.208.327 -webarena_verified.147.329,False,shopping,retrieve_value,329,test,webarena_verified.147.328 +webarena_verified.147.329,False,shopping,retrieve_value,329,test,webarena_verified.208.328 webarena_verified.147.330,False,shopping,retrieve_value,330,test,webarena_verified.147.329 webarena_verified.147.331,False,shopping,retrieve_value,331,test,webarena_verified.147.330 webarena_verified.147.332,False,shopping,retrieve_value,332,train,webarena_verified.147.331 webarena_verified.147.333,False,shopping,retrieve_value,333,train,webarena_verified.147.332 -webarena_verified.169.334,False,shopping,retrieve_value,334,train,webarena_verified.169.333 +webarena_verified.169.334,False,shopping,retrieve_value,334,train,webarena_verified.147.333 webarena_verified.169.335,False,shopping,retrieve_value,335,train,webarena_verified.169.334 webarena_verified.169.336,False,shopping,retrieve_value,336,test,webarena_verified.169.335 webarena_verified.169.337,False,shopping,retrieve_value,337,test,webarena_verified.169.336 webarena_verified.169.338,False,shopping,retrieve_value,338,train,webarena_verified.169.337 -webarena_verified.299.339,False,gitlab,ui_state,339,test,webarena_verified.299.318 +webarena_verified.299.339,False,gitlab,ui_state,339,test,webarena_verified.324.318 webarena_verified.299.340,False,gitlab,ui_state,340,train,webarena_verified.299.339 webarena_verified.299.341,False,gitlab,ui_state,341,test,webarena_verified.299.340 
webarena_verified.299.342,False,gitlab,ui_state,342,test,webarena_verified.299.341 webarena_verified.299.343,False,gitlab,ui_state,343,test,webarena_verified.299.342 -webarena_verified.248.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.248.292 +webarena_verified.248.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.234.292 webarena_verified.248.345,False,shopping_admin,retrieve_value,345,train,webarena_verified.248.344 webarena_verified.248.346,False,shopping_admin,retrieve_value,346,train,webarena_verified.248.345 webarena_verified.248.347,False,shopping_admin,retrieve_value,347,train,webarena_verified.248.346 webarena_verified.248.348,False,shopping_admin,retrieve_value,348,test,webarena_verified.248.347 -webarena_verified.298.349,False,gitlab,retrieve_value,349,test,webarena_verified.298.343 +webarena_verified.298.349,False,gitlab,retrieve_value,349,test,webarena_verified.299.343 webarena_verified.298.350,False,gitlab,retrieve_value,350,test,webarena_verified.298.349 -webarena_verified.137.351,False,shopping,ui_state,351,train,webarena_verified.137.338 +webarena_verified.137.351,False,shopping,ui_state,351,train,webarena_verified.169.338 webarena_verified.137.352,False,shopping,ui_state,352,test,webarena_verified.137.351 webarena_verified.137.353,False,shopping,ui_state,353,test,webarena_verified.137.352 webarena_verified.137.354,False,shopping,ui_state,354,train,webarena_verified.137.353 webarena_verified.137.355,False,shopping,ui_state,355,train,webarena_verified.137.354 -webarena_verified.49.356,False,map,program_html,356,test,webarena_verified.49.287 -webarena_verified.291.357,False,gitlab,ui_state,357,test,webarena_verified.291.350 -webarena_verified.206.358,False,shopping,retrieve_value,358,train,webarena_verified.206.355 +webarena_verified.49.356,False,map,program_html,356,test,webarena_verified.47.287 +webarena_verified.291.357,False,gitlab,ui_state,357,test,webarena_verified.298.350 
+webarena_verified.206.358,False,shopping,retrieve_value,358,train,webarena_verified.137.355 webarena_verified.206.359,False,shopping,retrieve_value,359,test,webarena_verified.206.358 webarena_verified.206.360,False,shopping,retrieve_value,360,train,webarena_verified.206.359 webarena_verified.206.361,False,shopping,retrieve_value,361,train,webarena_verified.206.360 webarena_verified.206.362,False,shopping,retrieve_value,362,test,webarena_verified.206.361 -webarena_verified.58.363,False,map,retrieve_value,363,train,webarena_verified.58.356 +webarena_verified.58.363,False,map,retrieve_value,363,train,webarena_verified.49.356 webarena_verified.58.364,False,map,retrieve_value,364,test,webarena_verified.58.363 webarena_verified.58.365,False,map,retrieve_value,365,test,webarena_verified.58.364 webarena_verified.58.366,False,map,retrieve_value,366,train,webarena_verified.58.365 webarena_verified.58.367,False,map,retrieve_value,367,train,webarena_verified.58.366 -webarena_verified.188.368,False,shopping,retrieve_value,368,test,webarena_verified.188.362 -webarena_verified.52.369,False,map,program_html,369,train,webarena_verified.52.367 +webarena_verified.188.368,False,shopping,retrieve_value,368,test,webarena_verified.206.362 +webarena_verified.52.369,False,map,program_html,369,train,webarena_verified.58.367 webarena_verified.52.370,False,map,program_html,370,test,webarena_verified.52.369 webarena_verified.52.371,False,map,program_html,371,test,webarena_verified.52.370 webarena_verified.52.372,False,map,program_html,372,train,webarena_verified.52.371 webarena_verified.52.373,False,map,program_html,373,train,webarena_verified.52.372 -webarena_verified.266.374,False,shopping_admin,ui_state,374,train,webarena_verified.266.348 +webarena_verified.266.374,False,shopping_admin,ui_state,374,train,webarena_verified.248.348 webarena_verified.266.375,False,shopping_admin,ui_state,375,train,webarena_verified.266.374 
-webarena_verified.182.376,False,shopping,retrieve_value,376,test,webarena_verified.182.368 -webarena_verified.59.377,False,map,ui_state,377,test,webarena_verified.59.373 +webarena_verified.182.376,False,shopping,retrieve_value,376,test,webarena_verified.188.368 +webarena_verified.59.377,False,map,ui_state,377,test,webarena_verified.52.373 webarena_verified.59.378,False,map,ui_state,378,train,webarena_verified.59.377 webarena_verified.59.379,False,map,ui_state,379,train,webarena_verified.59.378 webarena_verified.59.380,False,map,ui_state,380,test,webarena_verified.59.379 webarena_verified.59.381,False,map,ui_state,381,train,webarena_verified.59.380 -webarena_verified.781.382,False,map,string_match,382,test,webarena_verified.781.381 -webarena_verified.782.383,False,map,retrieve_value,383,test,webarena_verified.782.382 -webarena_verified.666.384,False,shopping,retrieve_value,384,test,webarena_verified.666.376 +webarena_verified.781.382,False,map,string_match,382,test,webarena_verified.59.381 +webarena_verified.782.383,False,map,retrieve_value,383,test,webarena_verified.781.382 +webarena_verified.666.384,False,shopping,retrieve_value,384,test,webarena_verified.182.376 webarena_verified.666.385,False,shopping,retrieve_value,385,train,webarena_verified.666.384 -webarena_verified.1355.386,False,shopping,retrieve_value,386,test,webarena_verified.1355.385 -webarena_verified.1356.387,False,shopping,retrieve_value,387,train,webarena_verified.1356.386 +webarena_verified.1355.386,False,shopping,retrieve_value,386,test,webarena_verified.666.385 +webarena_verified.1356.387,False,shopping,retrieve_value,387,train,webarena_verified.1355.386 webarena_verified.1356.388,False,shopping,retrieve_value,388,test,webarena_verified.1356.387 -webarena_verified.348.389,False,gitlab,backend_state,389,test,webarena_verified.348.357 +webarena_verified.348.389,False,gitlab,backend_state,389,test,webarena_verified.291.357 
webarena_verified.348.390,False,gitlab,backend_state,390,train,webarena_verified.348.389 webarena_verified.348.391,False,gitlab,backend_state,391,train,webarena_verified.348.390 webarena_verified.348.392,False,gitlab,backend_state,392,test,webarena_verified.348.391 webarena_verified.348.393,False,gitlab,backend_state,393,train,webarena_verified.348.392 -webarena_verified.352.394,False,gitlab,backend_state,394,test,webarena_verified.352.393 +webarena_verified.352.394,False,gitlab,backend_state,394,test,webarena_verified.348.393 webarena_verified.352.395,False,gitlab,backend_state,395,train,webarena_verified.352.394 webarena_verified.352.396,False,gitlab,backend_state,396,train,webarena_verified.352.395 webarena_verified.352.397,False,gitlab,backend_state,397,train,webarena_verified.352.396 webarena_verified.352.398,False,gitlab,backend_state,398,test,webarena_verified.352.397 -webarena_verified.6.399,False,reddit,backend_state,399,train,webarena_verified.6.69 +webarena_verified.6.399,False,reddit,backend_state,399,train,webarena_verified.17.69 webarena_verified.6.400,False,reddit,backend_state,400,test,webarena_verified.6.399 webarena_verified.6.401,False,reddit,backend_state,401,train,webarena_verified.6.400 webarena_verified.6.402,False,reddit,backend_state,402,train,webarena_verified.6.401 webarena_verified.6.403,False,reddit,backend_state,403,test,webarena_verified.6.402 -webarena_verified.22.404,False,reddit,backend_state,404,train,webarena_verified.22.403 +webarena_verified.22.404,False,reddit,backend_state,404,train,webarena_verified.6.403 webarena_verified.22.405,False,reddit,backend_state,405,test,webarena_verified.22.404 webarena_verified.22.406,False,reddit,backend_state,406,train,webarena_verified.22.405 webarena_verified.22.407,False,reddit,backend_state,407,test,webarena_verified.22.406 webarena_verified.22.408,False,reddit,backend_state,408,train,webarena_verified.22.407 
-webarena_verified.23.409,False,reddit,backend_state,409,test,webarena_verified.23.408 +webarena_verified.23.409,False,reddit,backend_state,409,test,webarena_verified.22.408 webarena_verified.23.410,False,reddit,backend_state,410,test,webarena_verified.23.409 -webarena_verified.355.411,False,gitlab,backend_state,411,test,webarena_verified.355.398 +webarena_verified.355.411,False,gitlab,backend_state,411,test,webarena_verified.352.398 webarena_verified.355.412,False,gitlab,backend_state,412,test,webarena_verified.355.411 webarena_verified.355.413,False,gitlab,backend_state,413,test,webarena_verified.355.412 webarena_verified.355.414,False,gitlab,backend_state,414,test,webarena_verified.355.413 -webarena_verified.360.415,False,gitlab,backend_state,415,test,webarena_verified.360.414 +webarena_verified.360.415,False,gitlab,backend_state,415,test,webarena_verified.355.414 webarena_verified.360.416,False,gitlab,backend_state,416,test,webarena_verified.360.415 webarena_verified.360.417,False,gitlab,backend_state,417,test,webarena_verified.360.416 -webarena_verified.361.418,False,gitlab,backend_state,418,train,webarena_verified.361.417 +webarena_verified.361.418,False,gitlab,backend_state,418,train,webarena_verified.360.417 webarena_verified.361.419,False,gitlab,backend_state,419,test,webarena_verified.361.418 webarena_verified.361.420,False,gitlab,backend_state,420,test,webarena_verified.361.419 webarena_verified.361.421,False,gitlab,backend_state,421,train,webarena_verified.361.420 webarena_verified.361.422,False,gitlab,backend_state,422,train,webarena_verified.361.421 -webarena_verified.237.423,False,shopping_admin,backend_state,423,train,webarena_verified.237.375 -webarena_verified.371.424,False,wikipedia map,program_html,424,train,webarena_verified.371.383 +webarena_verified.237.423,False,shopping_admin,backend_state,423,train,webarena_verified.266.375 +webarena_verified.371.424,False,wikipedia map,program_html,424,train,webarena_verified.782.383 
webarena_verified.371.425,False,wikipedia map,program_html,425,train,webarena_verified.371.424 webarena_verified.371.426,False,wikipedia map,program_html,426,test,webarena_verified.371.425 webarena_verified.371.427,False,wikipedia map,program_html,427,test,webarena_verified.371.426 webarena_verified.371.428,False,wikipedia map,program_html,428,train,webarena_verified.371.427 webarena_verified.371.429,False,wikipedia map,program_html,429,train,webarena_verified.371.428 webarena_verified.371.430,False,wikipedia map,program_html,430,test,webarena_verified.371.429 -webarena_verified.145.431,False,shopping,program_html,431,train,webarena_verified.145.388 +webarena_verified.145.431,False,shopping,program_html,431,train,webarena_verified.1356.388 webarena_verified.145.432,False,shopping,backend_state,432,test,webarena_verified.145.431 webarena_verified.145.433,False,shopping,backend_state,433,train,webarena_verified.145.432 webarena_verified.145.434,False,shopping,backend_state,434,train,webarena_verified.145.433 webarena_verified.145.435,False,shopping,backend_state,435,train,webarena_verified.145.434 -webarena_verified.156.436,False,shopping,backend_state,436,test,webarena_verified.156.435 +webarena_verified.156.436,False,shopping,backend_state,436,test,webarena_verified.145.435 webarena_verified.156.437,False,shopping,backend_state,437,train,webarena_verified.156.436 webarena_verified.156.438,False,shopping,backend_state,438,train,webarena_verified.156.437 webarena_verified.156.439,False,shopping,backend_state,439,train,webarena_verified.156.438 webarena_verified.156.440,False,shopping,backend_state,440,test,webarena_verified.156.439 -webarena_verified.308.441,False,gitlab,backend_state,441,train,webarena_verified.308.422 +webarena_verified.308.441,False,gitlab,backend_state,441,train,webarena_verified.361.422 webarena_verified.308.442,False,gitlab,backend_state,442,train,webarena_verified.308.441 
webarena_verified.308.443,False,gitlab,backend_state,443,test,webarena_verified.308.442 webarena_verified.308.444,False,gitlab,backend_state,444,train,webarena_verified.308.443 webarena_verified.308.445,False,gitlab,backend_state,445,test,webarena_verified.308.444 -webarena_verified.999.446,False,gitlab,backend_state,446,test,webarena_verified.999.445 +webarena_verified.999.446,False,gitlab,backend_state,446,test,webarena_verified.308.445 webarena_verified.999.447,False,gitlab,backend_state,447,train,webarena_verified.999.446 -webarena_verified.331.448,False,gitlab,backend_state,448,test,webarena_verified.331.447 +webarena_verified.331.448,False,gitlab,backend_state,448,test,webarena_verified.999.447 webarena_verified.331.449,False,gitlab,backend_state,449,test,webarena_verified.331.448 webarena_verified.331.450,False,gitlab,retrieve_value,450,train,webarena_verified.331.449 webarena_verified.331.451,False,gitlab,retrieve_value,451,train,webarena_verified.331.450 webarena_verified.331.452,False,gitlab,retrieve_value,452,train,webarena_verified.331.451 -webarena_verified.242.453,False,shopping_admin,backend_state,453,train,webarena_verified.242.423 +webarena_verified.242.453,False,shopping_admin,backend_state,453,train,webarena_verified.237.423 webarena_verified.242.454,False,shopping_admin,backend_state,454,test,webarena_verified.242.453 webarena_verified.242.455,False,shopping_admin,backend_state,455,train,webarena_verified.242.454 webarena_verified.242.456,False,shopping_admin,backend_state,456,test,webarena_verified.242.455 webarena_verified.242.457,False,shopping_admin,backend_state,457,train,webarena_verified.242.456 -webarena_verified.247.458,False,shopping_admin,backend_state,458,test,webarena_verified.247.457 +webarena_verified.247.458,False,shopping_admin,backend_state,458,test,webarena_verified.242.457 webarena_verified.247.459,False,shopping_admin,backend_state,459,test,webarena_verified.247.458 
webarena_verified.247.460,False,shopping_admin,backend_state,460,train,webarena_verified.247.459 webarena_verified.247.461,False,shopping_admin,backend_state,461,train,webarena_verified.247.460 webarena_verified.247.462,False,shopping_admin,backend_state,462,test,webarena_verified.247.461 webarena_verified.247.463,False,shopping_admin,backend_state,463,test,webarena_verified.247.462 -webarena_verified.251.464,False,shopping_admin,backend_state,464,train,webarena_verified.251.463 -webarena_verified.186.465,False,shopping,backend_state,465,train,webarena_verified.186.440 +webarena_verified.251.464,False,shopping_admin,backend_state,464,train,webarena_verified.247.463 +webarena_verified.186.465,False,shopping,backend_state,465,train,webarena_verified.156.440 webarena_verified.186.466,False,shopping,backend_state,466,train,webarena_verified.186.465 webarena_verified.186.467,False,shopping,backend_state,467,train,webarena_verified.186.466 webarena_verified.186.468,False,shopping,backend_state,468,test,webarena_verified.186.467 webarena_verified.186.469,False,shopping,backend_state,469,test,webarena_verified.186.468 -webarena_verified.257.470,False,shopping_admin,backend_state,470,test,webarena_verified.257.464 +webarena_verified.257.470,False,shopping_admin,backend_state,470,test,webarena_verified.251.464 webarena_verified.257.471,False,shopping_admin,backend_state,471,test,webarena_verified.257.470 webarena_verified.257.472,False,shopping_admin,backend_state,472,train,webarena_verified.257.471 webarena_verified.257.473,False,shopping_admin,backend_state,473,train,webarena_verified.257.472 webarena_verified.257.474,False,shopping_admin,backend_state,474,train,webarena_verified.257.473 -webarena_verified.292.475,False,gitlab,backend_state,475,train,webarena_verified.292.452 +webarena_verified.292.475,False,gitlab,backend_state,475,train,webarena_verified.331.452 webarena_verified.292.476,False,gitlab,backend_state,476,train,webarena_verified.292.475 
webarena_verified.292.477,False,gitlab,backend_state,477,train,webarena_verified.292.476 webarena_verified.292.478,False,gitlab,backend_state,478,test,webarena_verified.292.477 webarena_verified.292.479,False,gitlab,backend_state,479,test,webarena_verified.292.478 -webarena_verified.293.480,False,gitlab,backend_state,480,train,webarena_verified.293.479 -webarena_verified.294.481,False,gitlab,backend_state,481,train,webarena_verified.294.480 +webarena_verified.293.480,False,gitlab,backend_state,480,train,webarena_verified.292.479 +webarena_verified.294.481,False,gitlab,backend_state,481,train,webarena_verified.293.480 webarena_verified.294.482,False,gitlab,backend_state,482,train,webarena_verified.294.481 webarena_verified.294.483,False,gitlab,backend_state,483,test,webarena_verified.294.482 webarena_verified.294.484,False,gitlab,backend_state,484,train,webarena_verified.294.483 webarena_verified.294.485,False,gitlab,backend_state,485,test,webarena_verified.294.484 -webarena_verified.275.486,False,shopping_admin,backend_state,486,train,webarena_verified.275.474 +webarena_verified.275.486,False,shopping_admin,backend_state,486,train,webarena_verified.257.474 webarena_verified.275.487,False,shopping_admin,backend_state,487,test,webarena_verified.275.486 webarena_verified.275.488,False,shopping_admin,backend_state,488,test,webarena_verified.275.487 webarena_verified.275.489,False,shopping_admin,backend_state,489,train,webarena_verified.275.488 webarena_verified.275.490,False,shopping_admin,backend_state,490,train,webarena_verified.275.489 -webarena_verified.280.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.280.490 +webarena_verified.280.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.275.490 webarena_verified.280.492,False,shopping_admin,backend_state,492,train,webarena_verified.280.491 webarena_verified.280.493,False,shopping_admin,backend_state,493,train,webarena_verified.280.492 
webarena_verified.280.494,False,shopping_admin,backend_state,494,train,webarena_verified.280.493 webarena_verified.280.495,False,shopping_admin,backend_state,495,test,webarena_verified.280.494 -webarena_verified.284.496,False,shopping_admin,backend_state,496,train,webarena_verified.284.495 +webarena_verified.284.496,False,shopping_admin,backend_state,496,train,webarena_verified.280.495 webarena_verified.284.497,False,shopping_admin,backend_state,497,test,webarena_verified.284.496 webarena_verified.284.498,False,shopping_admin,backend_state,498,test,webarena_verified.284.497 webarena_verified.284.499,False,shopping_admin,backend_state,499,train,webarena_verified.284.498 webarena_verified.284.500,False,shopping_admin,backend_state,500,train,webarena_verified.284.499 -webarena_verified.287.501,False,shopping_admin,backend_state,501,train,webarena_verified.287.500 +webarena_verified.287.501,False,shopping_admin,backend_state,501,train,webarena_verified.284.500 webarena_verified.287.502,False,shopping_admin,backend_state,502,test,webarena_verified.287.501 webarena_verified.287.503,False,shopping_admin,backend_state,503,train,webarena_verified.287.502 webarena_verified.287.504,False,shopping_admin,backend_state,504,test,webarena_verified.287.503 webarena_verified.287.505,False,shopping_admin,backend_state,505,train,webarena_verified.287.504 -webarena_verified.172.506,False,shopping,backend_state,506,train,webarena_verified.172.469 +webarena_verified.172.506,False,shopping,backend_state,506,train,webarena_verified.186.469 webarena_verified.172.507,False,shopping,backend_state,507,train,webarena_verified.172.506 webarena_verified.172.508,False,shopping,backend_state,508,test,webarena_verified.172.507 -webarena_verified.216.509,False,shopping,backend_state,509,test,webarena_verified.216.508 +webarena_verified.216.509,False,shopping,backend_state,509,test,webarena_verified.172.508 webarena_verified.216.510,False,shopping,backend_state,510,test,webarena_verified.216.509 
-webarena_verified.189.511,False,shopping,program_html,511,test,webarena_verified.189.510 +webarena_verified.189.511,False,shopping,program_html,511,test,webarena_verified.216.510 webarena_verified.189.512,False,shopping,program_html,512,train,webarena_verified.189.511 webarena_verified.189.513,False,shopping,program_html,513,train,webarena_verified.189.512 webarena_verified.189.514,False,shopping,program_html,514,test,webarena_verified.189.513 webarena_verified.189.515,False,shopping,program_html,515,train,webarena_verified.189.514 -webarena_verified.196.516,False,shopping,backend_state,516,train,webarena_verified.196.515 +webarena_verified.196.516,False,shopping,backend_state,516,train,webarena_verified.189.515 webarena_verified.196.517,False,shopping,backend_state,517,test,webarena_verified.196.516 webarena_verified.196.518,False,shopping,backend_state,518,test,webarena_verified.196.517 webarena_verified.196.519,False,shopping,backend_state,519,test,webarena_verified.196.518 webarena_verified.196.520,False,shopping,backend_state,520,train,webarena_verified.196.519 -webarena_verified.199.521,False,shopping,backend_state,521,test,webarena_verified.199.520 -webarena_verified.352.522,False,gitlab,backend_state,522,test,webarena_verified.352.485 -webarena_verified.354.523,False,gitlab,backend_state,523,train,webarena_verified.354.522 +webarena_verified.199.521,False,shopping,backend_state,521,test,webarena_verified.196.520 +webarena_verified.352.522,False,gitlab,backend_state,522,test,webarena_verified.294.485 +webarena_verified.354.523,False,gitlab,backend_state,523,train,webarena_verified.352.522 webarena_verified.354.524,False,gitlab,backend_state,524,test,webarena_verified.354.523 webarena_verified.354.525,False,gitlab,backend_state,525,train,webarena_verified.354.524 webarena_verified.354.526,False,gitlab,backend_state,526,train,webarena_verified.354.525 webarena_verified.354.527,False,gitlab,backend_state,527,test,webarena_verified.354.526 
-webarena_verified.154.528,False,shopping,program_html,528,train,webarena_verified.154.521 +webarena_verified.154.528,False,shopping,program_html,528,train,webarena_verified.199.521 webarena_verified.154.529,False,shopping,program_html,529,test,webarena_verified.154.528 webarena_verified.154.530,False,shopping,program_html,530,test,webarena_verified.154.529 webarena_verified.154.531,False,shopping,program_html,531,train,webarena_verified.154.530 webarena_verified.154.532,False,shopping,program_html,532,train,webarena_verified.154.531 -webarena_verified.330.533,False,gitlab,backend_state,533,test,webarena_verified.330.527 +webarena_verified.330.533,False,gitlab,backend_state,533,test,webarena_verified.354.527 webarena_verified.330.534,False,gitlab,backend_state,534,train,webarena_verified.330.533 webarena_verified.330.535,False,gitlab,backend_state,535,test,webarena_verified.330.534 webarena_verified.330.536,False,gitlab,backend_state,536,train,webarena_verified.330.535 webarena_verified.330.537,False,gitlab,backend_state,537,train,webarena_verified.330.536 -webarena_verified.240.538,False,shopping_admin,backend_state,538,train,webarena_verified.240.505 +webarena_verified.240.538,False,shopping_admin,backend_state,538,train,webarena_verified.287.505 webarena_verified.240.539,False,shopping_admin,backend_state,539,train,webarena_verified.240.538 webarena_verified.240.540,False,shopping_admin,backend_state,540,test,webarena_verified.240.539 webarena_verified.240.541,False,shopping_admin,backend_state,541,test,webarena_verified.240.540 webarena_verified.240.542,False,shopping_admin,backend_state,542,train,webarena_verified.240.541 -webarena_verified.251.543,False,shopping_admin,backend_state,543,test,webarena_verified.251.542 +webarena_verified.251.543,False,shopping_admin,backend_state,543,test,webarena_verified.240.542 webarena_verified.251.544,False,shopping_admin,backend_state,544,test,webarena_verified.251.543 
webarena_verified.251.545,False,shopping_admin,backend_state,545,test,webarena_verified.251.544 webarena_verified.251.546,False,shopping_admin,retrieve_value,546,train,webarena_verified.251.545 -webarena_verified.252.547,False,shopping_admin,backend_state,547,train,webarena_verified.252.546 +webarena_verified.252.547,False,shopping_admin,backend_state,547,train,webarena_verified.251.546 webarena_verified.252.548,False,shopping_admin,backend_state,548,train,webarena_verified.252.547 webarena_verified.252.549,False,shopping_admin,backend_state,549,test,webarena_verified.252.548 webarena_verified.252.550,False,shopping_admin,backend_state,550,train,webarena_verified.252.549 webarena_verified.252.551,False,shopping_admin,backend_state,551,test,webarena_verified.252.550 -webarena_verified.84.552,False,gitlab reddit,program_html,552,test,webarena_verified.84.537 webarena_verified.84.410 +webarena_verified.84.552,False,gitlab reddit,program_html,552,test,webarena_verified.23.410 webarena_verified.84.553,False,gitlab reddit,program_html,553,test,webarena_verified.84.552 webarena_verified.84.554,False,gitlab reddit,program_html,554,test,webarena_verified.84.553 webarena_verified.84.555,False,gitlab reddit,program_html,555,test,webarena_verified.84.554 -webarena_verified.87.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.87.555 +webarena_verified.87.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.84.555 webarena_verified.87.557,False,gitlab wikipedia,program_html,557,test,webarena_verified.87.556 webarena_verified.87.558,False,gitlab wikipedia,program_html,558,train,webarena_verified.87.557 webarena_verified.87.559,False,gitlab wikipedia,program_html,559,train,webarena_verified.87.558 webarena_verified.87.560,False,gitlab wikipedia,program_html,560,test,webarena_verified.87.559 webarena_verified.87.561,False,gitlab wikipedia,program_html,561,test,webarena_verified.87.560 -webarena_verified.88.562,False,gitlab 
reddit,program_html,562,train,webarena_verified.88.561 webarena_verified.88.555 +webarena_verified.88.562,False,gitlab reddit,program_html,562,train,webarena_verified.84.555 webarena_verified.88.563,False,gitlab reddit,program_html,563,train,webarena_verified.88.562 webarena_verified.88.564,False,gitlab reddit,program_html,564,train,webarena_verified.88.563 webarena_verified.88.565,False,gitlab reddit,program_html,565,test,webarena_verified.88.564 webarena_verified.88.566,False,gitlab reddit,program_html,566,test,webarena_verified.88.565 -webarena_verified.293.567,False,gitlab,backend_state,567,test,webarena_verified.293.566 +webarena_verified.293.567,False,gitlab,backend_state,567,test,webarena_verified.88.566 webarena_verified.293.568,False,gitlab,backend_state,568,train,webarena_verified.293.567 webarena_verified.293.569,False,gitlab,backend_state,569,train,webarena_verified.293.568 webarena_verified.293.570,False,gitlab,backend_state,570,test,webarena_verified.293.569 -webarena_verified.165.571,False,shopping,backend_state,571,test,webarena_verified.165.532 +webarena_verified.165.571,False,shopping,backend_state,571,test,webarena_verified.154.532 webarena_verified.165.572,False,shopping,backend_state,572,train,webarena_verified.165.571 webarena_verified.165.573,False,shopping,backend_state,573,train,webarena_verified.165.572 webarena_verified.165.574,False,shopping,backend_state,574,test,webarena_verified.165.573 webarena_verified.165.575,False,shopping,backend_state,575,train,webarena_verified.165.574 -webarena_verified.351.576,False,gitlab,backend_state,576,test,webarena_verified.351.570 +webarena_verified.351.576,False,gitlab,backend_state,576,test,webarena_verified.293.570 webarena_verified.351.577,False,gitlab,backend_state,577,train,webarena_verified.351.576 webarena_verified.351.578,False,gitlab,backend_state,578,test,webarena_verified.351.577 webarena_verified.351.579,False,gitlab,backend_state,579,train,webarena_verified.351.578 
-webarena_verified.7.580,False,reddit,backend_state,580,train,webarena_verified.7.566 +webarena_verified.7.580,False,reddit,backend_state,580,train,webarena_verified.88.566 webarena_verified.7.581,False,reddit,backend_state,581,train,webarena_verified.7.580 webarena_verified.7.582,False,reddit,backend_state,582,test,webarena_verified.7.581 webarena_verified.7.583,False,reddit,backend_state,583,test,webarena_verified.7.582 webarena_verified.7.584,False,reddit,backend_state,584,train,webarena_verified.7.583 -webarena_verified.194.585,False,shopping,backend_state,585,train,webarena_verified.194.575 +webarena_verified.194.585,False,shopping,backend_state,585,train,webarena_verified.165.575 webarena_verified.194.586,False,shopping,backend_state,586,test,webarena_verified.194.585 webarena_verified.194.587,False,shopping,backend_state,587,train,webarena_verified.194.586 webarena_verified.194.588,False,shopping,backend_state,588,train,webarena_verified.194.587 webarena_verified.194.589,False,shopping,backend_state,589,test,webarena_verified.194.588 -webarena_verified.339.590,False,gitlab,backend_state,590,train,webarena_verified.339.579 +webarena_verified.339.590,False,gitlab,backend_state,590,train,webarena_verified.351.579 webarena_verified.339.591,False,gitlab,backend_state,591,test,webarena_verified.339.590 webarena_verified.339.592,False,gitlab,backend_state,592,test,webarena_verified.339.591 webarena_verified.339.593,False,gitlab,backend_state,593,test,webarena_verified.339.592 webarena_verified.339.594,False,gitlab,backend_state,594,train,webarena_verified.339.593 -webarena_verified.4.595,False,reddit,backend_state,595,train,webarena_verified.4.584 +webarena_verified.4.595,False,reddit,backend_state,595,train,webarena_verified.7.584 webarena_verified.4.596,False,reddit,backend_state,596,test,webarena_verified.4.595 webarena_verified.4.597,False,reddit,backend_state,597,train,webarena_verified.4.596 
webarena_verified.4.598,False,reddit,backend_state,598,train,webarena_verified.4.597 webarena_verified.4.599,False,reddit,backend_state,599,test,webarena_verified.4.598 -webarena_verified.3765.600,False,reddit,backend_state,600,test,webarena_verified.3765.599 +webarena_verified.3765.600,False,reddit,backend_state,600,test,webarena_verified.4.599 webarena_verified.3765.601,False,reddit,backend_state,601,train,webarena_verified.3765.600 webarena_verified.3765.602,False,reddit,backend_state,602,train,webarena_verified.3765.601 webarena_verified.3765.603,False,reddit,backend_state,603,train,webarena_verified.3765.602 webarena_verified.3765.604,False,reddit,backend_state,604,test,webarena_verified.3765.603 -webarena_verified.5.605,False,reddit,backend_state,605,train,webarena_verified.5.604 +webarena_verified.5.605,False,reddit,backend_state,605,train,webarena_verified.3765.604 webarena_verified.5.606,False,reddit,backend_state,606,train,webarena_verified.5.605 webarena_verified.5.607,False,reddit,backend_state,607,test,webarena_verified.5.606 webarena_verified.5.608,False,reddit,backend_state,608,test,webarena_verified.5.607 webarena_verified.5.609,False,reddit,backend_state,609,train,webarena_verified.5.608 -webarena_verified.9.610,False,reddit,backend_state,610,train,webarena_verified.9.609 +webarena_verified.9.610,False,reddit,backend_state,610,train,webarena_verified.5.609 webarena_verified.9.611,False,reddit,backend_state,611,train,webarena_verified.9.610 webarena_verified.9.612,False,reddit,backend_state,612,test,webarena_verified.9.611 webarena_verified.9.613,False,reddit,backend_state,613,train,webarena_verified.9.612 webarena_verified.9.614,False,reddit,backend_state,614,test,webarena_verified.9.613 -webarena_verified.11.615,False,reddit,ui_state,615,test,webarena_verified.11.614 +webarena_verified.11.615,False,reddit,ui_state,615,test,webarena_verified.9.614 webarena_verified.11.616,False,reddit,ui_state,616,test,webarena_verified.11.615 
webarena_verified.11.617,False,reddit,ui_state,617,train,webarena_verified.11.616 webarena_verified.11.618,False,reddit,ui_state,618,train,webarena_verified.11.617 webarena_verified.11.619,False,reddit,ui_state,619,train,webarena_verified.11.618 -webarena_verified.12.620,False,reddit,backend_state,620,train,webarena_verified.12.619 +webarena_verified.12.620,False,reddit,backend_state,620,train,webarena_verified.11.619 webarena_verified.12.621,False,reddit,backend_state,621,train,webarena_verified.12.620 webarena_verified.12.622,False,reddit,backend_state,622,train,webarena_verified.12.621 webarena_verified.12.623,False,reddit,backend_state,623,test,webarena_verified.12.622 webarena_verified.12.624,False,reddit,backend_state,624,test,webarena_verified.12.623 -webarena_verified.13.625,False,reddit,backend_state,625,train,webarena_verified.13.624 +webarena_verified.13.625,False,reddit,backend_state,625,train,webarena_verified.12.624 webarena_verified.13.626,False,reddit,backend_state,626,train,webarena_verified.13.625 webarena_verified.13.627,False,reddit,backend_state,627,train,webarena_verified.13.626 webarena_verified.13.628,False,reddit,backend_state,628,test,webarena_verified.13.627 webarena_verified.13.629,False,reddit,backend_state,629,test,webarena_verified.13.628 -webarena_verified.15.630,False,reddit,backend_state,630,test,webarena_verified.15.629 +webarena_verified.15.630,False,reddit,backend_state,630,test,webarena_verified.13.629 webarena_verified.15.631,False,reddit,backend_state,631,train,webarena_verified.15.630 webarena_verified.15.632,False,reddit,backend_state,632,train,webarena_verified.15.631 webarena_verified.15.633,False,reddit,backend_state,633,test,webarena_verified.15.632 webarena_verified.15.634,False,reddit,backend_state,634,train,webarena_verified.15.633 -webarena_verified.6100.635,False,reddit,backend_state,635,train,webarena_verified.6100.634 +webarena_verified.6100.635,False,reddit,backend_state,635,train,webarena_verified.15.634 
webarena_verified.6100.636,False,reddit,backend_state,636,train,webarena_verified.6100.635 webarena_verified.6100.637,False,reddit,backend_state,637,train,webarena_verified.6100.636 webarena_verified.6100.638,False,reddit,ui_state,638,test,webarena_verified.6100.637 webarena_verified.6100.639,False,reddit,backend_state,639,test,webarena_verified.6100.638 -webarena_verified.16.640,False,reddit,backend_state,640,train,webarena_verified.16.639 +webarena_verified.16.640,False,reddit,backend_state,640,train,webarena_verified.6100.639 webarena_verified.16.641,False,reddit,backend_state,641,test,webarena_verified.16.640 webarena_verified.16.642,False,reddit,backend_state,642,test,webarena_verified.16.641 webarena_verified.16.643,False,reddit,backend_state,643,train,webarena_verified.16.642 webarena_verified.16.644,False,reddit,backend_state,644,train,webarena_verified.16.643 -webarena_verified.19.645,False,reddit,backend_state,645,train,webarena_verified.19.644 +webarena_verified.19.645,False,reddit,backend_state,645,train,webarena_verified.16.644 webarena_verified.19.646,False,reddit,backend_state,646,train,webarena_verified.19.645 webarena_verified.19.647,False,reddit,backend_state,647,train,webarena_verified.19.646 webarena_verified.19.648,False,reddit,backend_state,648,test,webarena_verified.19.647 webarena_verified.19.649,False,reddit,backend_state,649,test,webarena_verified.19.648 -webarena_verified.23.650,False,reddit,backend_state,650,train,webarena_verified.23.649 +webarena_verified.23.650,False,reddit,backend_state,650,train,webarena_verified.19.649 webarena_verified.23.651,False,reddit,backend_state,651,train,webarena_verified.23.650 webarena_verified.23.652,False,reddit,backend_state,652,train,webarena_verified.23.651 -webarena_verified.153.653,False,shopping,ui_state,653,train,webarena_verified.153.589 +webarena_verified.153.653,False,shopping,ui_state,653,train,webarena_verified.194.589 
webarena_verified.153.654,False,shopping,ui_state,654,test,webarena_verified.153.653 webarena_verified.153.655,False,shopping,ui_state,655,test,webarena_verified.153.654 webarena_verified.153.656,False,shopping,ui_state,656,train,webarena_verified.153.655 webarena_verified.153.657,False,shopping,ui_state,657,train,webarena_verified.153.656 -webarena_verified.327.658,False,gitlab,backend_state,658,train,webarena_verified.327.594 +webarena_verified.327.658,False,gitlab,backend_state,658,train,webarena_verified.339.594 webarena_verified.327.659,False,gitlab,backend_state,659,test,webarena_verified.327.658 webarena_verified.327.660,False,gitlab,backend_state,660,test,webarena_verified.327.659 -webarena_verified.328.661,False,gitlab,backend_state,661,test,webarena_verified.328.660 +webarena_verified.328.661,False,gitlab,backend_state,661,test,webarena_verified.327.660 webarena_verified.328.662,False,gitlab,backend_state,662,train,webarena_verified.328.661 webarena_verified.328.663,False,gitlab,backend_state,663,train,webarena_verified.328.662 webarena_verified.328.664,False,gitlab,backend_state,664,test,webarena_verified.328.663 webarena_verified.328.665,False,gitlab,backend_state,665,train,webarena_verified.328.664 -webarena_verified.335.666,False,gitlab,retrieve_value,666,test,webarena_verified.335.665 +webarena_verified.335.666,False,gitlab,retrieve_value,666,test,webarena_verified.328.665 webarena_verified.335.667,False,gitlab,backend_state,667,test,webarena_verified.335.666 webarena_verified.335.668,False,gitlab,retrieve_value,668,test,webarena_verified.335.667 -webarena_verified.337.669,False,gitlab,backend_state,669,test,webarena_verified.337.668 +webarena_verified.337.669,False,gitlab,backend_state,669,test,webarena_verified.335.668 webarena_verified.337.670,False,gitlab,backend_state,670,train,webarena_verified.337.669 -webarena_verified.101.671,False,shopping reddit,ui_state,671,train,webarena_verified.101.657 webarena_verified.101.652 
+webarena_verified.101.671,False,shopping reddit,ui_state,671,train,webarena_verified.23.652 webarena_verified.101.672,False,shopping reddit,ui_state,672,train,webarena_verified.101.671 webarena_verified.101.673,False,shopping reddit,ui_state,673,test,webarena_verified.101.672 webarena_verified.101.674,False,shopping reddit,ui_state,674,test,webarena_verified.101.673 webarena_verified.101.675,False,shopping reddit,ui_state,675,train,webarena_verified.101.674 -webarena_verified.253.676,False,shopping_admin,ui_state,676,test,webarena_verified.253.551 +webarena_verified.253.676,False,shopping_admin,ui_state,676,test,webarena_verified.252.551 webarena_verified.253.677,False,shopping_admin,ui_state,677,test,webarena_verified.253.676 webarena_verified.253.678,False,shopping_admin,ui_state,678,train,webarena_verified.253.677 webarena_verified.253.679,False,shopping_admin,ui_state,679,train,webarena_verified.253.678 webarena_verified.253.680,False,shopping_admin,ui_state,680,train,webarena_verified.253.679 -webarena_verified.116.681,False,reddit gitlab,ui_state,681,train,webarena_verified.116.675 webarena_verified.116.670 +webarena_verified.116.681,False,reddit gitlab,ui_state,681,train,webarena_verified.337.670 webarena_verified.116.682,False,reddit gitlab,ui_state,682,train,webarena_verified.116.681 webarena_verified.116.683,False,reddit gitlab,ui_state,683,test,webarena_verified.116.682 -webarena_verified.117.684,False,reddit gitlab,ui_state,684,train,webarena_verified.117.683 +webarena_verified.117.684,False,reddit gitlab,ui_state,684,train,webarena_verified.116.683 webarena_verified.117.685,False,reddit gitlab,ui_state,685,train,webarena_verified.117.684 webarena_verified.117.686,False,reddit gitlab,ui_state,686,train,webarena_verified.117.685 webarena_verified.117.687,False,reddit gitlab,ui_state,687,test,webarena_verified.117.686 webarena_verified.117.688,False,reddit gitlab,ui_state,688,test,webarena_verified.117.687 
-webarena_verified.163.689,False,shopping,ui_state,689,test,webarena_verified.163.675 +webarena_verified.163.689,False,shopping,ui_state,689,test,webarena_verified.101.675 webarena_verified.163.690,False,shopping,ui_state,690,test,webarena_verified.163.689 webarena_verified.163.691,False,shopping,ui_state,691,train,webarena_verified.163.690 webarena_verified.163.692,False,shopping,ui_state,692,train,webarena_verified.163.691 webarena_verified.163.693,False,shopping,ui_state,693,train,webarena_verified.163.692 -webarena_verified.256.694,False,shopping_admin,backend_state,694,train,webarena_verified.256.680 +webarena_verified.256.694,False,shopping_admin,backend_state,694,train,webarena_verified.253.680 webarena_verified.256.695,False,shopping_admin,backend_state,695,train,webarena_verified.256.694 webarena_verified.256.696,False,shopping_admin,backend_state,696,test,webarena_verified.256.695 webarena_verified.256.697,False,shopping_admin,backend_state,697,train,webarena_verified.256.696 webarena_verified.256.698,False,shopping_admin,backend_state,698,test,webarena_verified.256.697 -webarena_verified.258.699,False,shopping_admin,backend_state,699,train,webarena_verified.258.698 +webarena_verified.258.699,False,shopping_admin,backend_state,699,train,webarena_verified.256.698 webarena_verified.258.700,False,shopping_admin,backend_state,700,test,webarena_verified.258.699 webarena_verified.258.701,False,shopping_admin,backend_state,701,test,webarena_verified.258.700 webarena_verified.258.702,False,shopping_admin,backend_state,702,train,webarena_verified.258.701 webarena_verified.258.703,False,shopping_admin,backend_state,703,train,webarena_verified.258.702 -webarena_verified.268.704,False,shopping_admin,ui_state,704,test,webarena_verified.268.703 +webarena_verified.268.704,False,shopping_admin,ui_state,704,test,webarena_verified.258.703 webarena_verified.268.705,False,shopping_admin,ui_state,705,test,webarena_verified.268.704 
webarena_verified.268.706,False,shopping_admin,ui_state,706,train,webarena_verified.268.705 webarena_verified.268.707,False,shopping_admin,ui_state,707,train,webarena_verified.268.706 webarena_verified.268.708,False,shopping_admin,ui_state,708,train,webarena_verified.268.707 -webarena_verified.271.709,False,shopping_admin,ui_state,709,test,webarena_verified.271.708 +webarena_verified.271.709,False,shopping_admin,ui_state,709,test,webarena_verified.268.708 webarena_verified.271.710,False,shopping_admin,ui_state,710,test,webarena_verified.271.709 webarena_verified.271.711,False,shopping_admin,ui_state,711,train,webarena_verified.271.710 webarena_verified.271.712,False,shopping_admin,ui_state,712,train,webarena_verified.271.711 webarena_verified.271.713,False,shopping_admin,ui_state,713,train,webarena_verified.271.712 -webarena_verified.24.714,False,reddit,backend_state,714,train,webarena_verified.24.688 +webarena_verified.24.714,False,reddit,backend_state,714,train,webarena_verified.117.688 webarena_verified.24.715,False,reddit,backend_state,715,train,webarena_verified.24.714 webarena_verified.24.716,False,reddit,backend_state,716,train,webarena_verified.24.715 webarena_verified.24.717,False,reddit,backend_state,717,test,webarena_verified.24.716 webarena_verified.24.718,False,reddit,backend_state,718,test,webarena_verified.24.717 -webarena_verified.25.719,False,reddit,backend_state,719,train,webarena_verified.25.718 +webarena_verified.25.719,False,reddit,backend_state,719,train,webarena_verified.24.718 webarena_verified.25.720,False,reddit,backend_state,720,test,webarena_verified.25.719 webarena_verified.25.721,False,reddit,backend_state,721,train,webarena_verified.25.720 webarena_verified.25.722,False,reddit,backend_state,722,train,webarena_verified.25.721 webarena_verified.25.723,False,reddit,backend_state,723,test,webarena_verified.25.722 webarena_verified.25.724,False,reddit,backend_state,724,test,webarena_verified.25.723 
-webarena_verified.1510.725,False,reddit,backend_state,725,test,webarena_verified.1510.724 +webarena_verified.1510.725,False,reddit,backend_state,725,test,webarena_verified.25.724 webarena_verified.1510.726,False,reddit,backend_state,726,test,webarena_verified.1510.725 webarena_verified.1510.727,False,reddit,backend_state,727,train,webarena_verified.1510.726 webarena_verified.1510.728,False,reddit,backend_state,728,train,webarena_verified.1510.727 webarena_verified.1510.729,False,reddit,backend_state,729,train,webarena_verified.1510.728 webarena_verified.1510.730,False,reddit,backend_state,730,test,webarena_verified.1510.729 -webarena_verified.27.731,False,reddit,backend_state,731,test,webarena_verified.27.730 +webarena_verified.27.731,False,reddit,backend_state,731,test,webarena_verified.1510.730 webarena_verified.27.732,False,reddit,backend_state,732,train,webarena_verified.27.731 webarena_verified.27.733,False,reddit,backend_state,733,train,webarena_verified.27.732 webarena_verified.27.734,False,reddit,program_html,734,train,webarena_verified.27.733 webarena_verified.27.735,False,reddit,program_html,735,test,webarena_verified.27.734 -webarena_verified.355.736,False,gitlab,backend_state,736,train,webarena_verified.355.688 -webarena_verified.94.737,False,wikipedia map,program_html,737,train,webarena_verified.94.430 +webarena_verified.355.736,False,gitlab,backend_state,736,train,webarena_verified.117.688 +webarena_verified.94.737,False,wikipedia map,program_html,737,train,webarena_verified.371.430 webarena_verified.94.738,False,wikipedia map,program_html,738,test,webarena_verified.94.737 webarena_verified.94.739,False,wikipedia map,program_html,739,train,webarena_verified.94.738 webarena_verified.94.740,False,wikipedia map,program_html,740,test,webarena_verified.94.739 webarena_verified.94.741,False,wikipedia map,program_html,741,train,webarena_verified.94.740 -webarena_verified.332.742,False,gitlab,backend_state,742,test,webarena_verified.332.736 
+webarena_verified.332.742,False,gitlab,backend_state,742,test,webarena_verified.355.736 webarena_verified.332.743,False,gitlab,backend_state,743,test,webarena_verified.332.742 webarena_verified.332.744,False,gitlab,backend_state,744,test,webarena_verified.332.743 webarena_verified.332.745,False,gitlab,backend_state,745,test,webarena_verified.332.744 webarena_verified.332.746,False,gitlab,backend_state,746,train,webarena_verified.332.745 -webarena_verified.2100.747,False,gitlab,backend_state,747,train,webarena_verified.2100.746 +webarena_verified.2100.747,False,gitlab,backend_state,747,train,webarena_verified.332.746 webarena_verified.2100.748,False,gitlab,backend_state,748,train,webarena_verified.2100.747 webarena_verified.2100.749,False,gitlab,backend_state,749,test,webarena_verified.2100.748 webarena_verified.2100.750,False,gitlab,backend_state,750,test,webarena_verified.2100.749 webarena_verified.2100.751,False,gitlab,backend_state,751,train,webarena_verified.2100.750 -webarena_verified.332.752,False,gitlab,backend_state,752,train,webarena_verified.332.751 +webarena_verified.332.752,False,gitlab,backend_state,752,train,webarena_verified.2100.751 webarena_verified.332.753,False,gitlab,backend_state,753,test,webarena_verified.332.752 webarena_verified.332.754,False,gitlab,backend_state,754,train,webarena_verified.332.753 webarena_verified.332.755,False,gitlab,backend_state,755,test,webarena_verified.332.754 webarena_verified.332.756,False,gitlab,backend_state,756,train,webarena_verified.332.755 -webarena_verified.42.757,False,map,program_html,757,test,webarena_verified.42.741 +webarena_verified.42.757,False,map,program_html,757,test,webarena_verified.94.741 webarena_verified.42.758,False,map,program_html,758,test,webarena_verified.42.757 -webarena_verified.42.759,False,map shopping_admin,program_html,759,test,webarena_verified.42.758 webarena_verified.42.713 +webarena_verified.42.759,False,map shopping_admin,program_html,759,test,webarena_verified.271.713 
webarena_verified.42.760,False,map shopping_admin,program_html,760,test,webarena_verified.42.759 -webarena_verified.54.761,False,map,program_html,761,train,webarena_verified.54.760 +webarena_verified.54.761,False,map,program_html,761,train,webarena_verified.42.760 webarena_verified.54.762,False,map,program_html,762,train,webarena_verified.54.761 -webarena_verified.75.763,False,map,program_html,763,test,webarena_verified.75.762 +webarena_verified.75.763,False,map,program_html,763,test,webarena_verified.54.762 webarena_verified.75.764,False,map,program_html,764,test,webarena_verified.75.763 webarena_verified.75.765,False,map,program_html,765,train,webarena_verified.75.764 webarena_verified.75.766,False,map,program_html,766,train,webarena_verified.75.765 webarena_verified.75.767,False,map,program_html,767,train,webarena_verified.75.766 -webarena_verified.241.768,False,shopping_admin,backend_state,768,test,webarena_verified.241.760 +webarena_verified.241.768,False,shopping_admin,backend_state,768,test,webarena_verified.42.760 webarena_verified.241.769,False,shopping_admin,backend_state,769,test,webarena_verified.241.768 webarena_verified.241.770,False,shopping_admin,backend_state,770,train,webarena_verified.241.769 -webarena_verified.243.771,False,shopping_admin,backend_state,771,test,webarena_verified.243.770 -webarena_verified.246.772,False,shopping_admin,backend_state,772,test,webarena_verified.246.771 +webarena_verified.243.771,False,shopping_admin,backend_state,771,test,webarena_verified.241.770 +webarena_verified.246.772,False,shopping_admin,backend_state,772,test,webarena_verified.243.771 webarena_verified.246.773,False,shopping_admin,backend_state,773,train,webarena_verified.246.772 webarena_verified.246.774,False,shopping_admin,backend_state,774,train,webarena_verified.246.773 webarena_verified.246.775,False,shopping_admin,backend_state,775,train,webarena_verified.246.774 
webarena_verified.246.776,False,shopping_admin,backend_state,776,test,webarena_verified.246.775 -webarena_verified.742.777,False,shopping_admin,backend_state,777,train,webarena_verified.742.776 +webarena_verified.742.777,False,shopping_admin,backend_state,777,train,webarena_verified.246.776 webarena_verified.742.778,False,shopping_admin,backend_state,778,test,webarena_verified.742.777 webarena_verified.742.779,False,shopping_admin,backend_state,779,train,webarena_verified.742.778 webarena_verified.742.780,False,shopping_admin,backend_state,780,test,webarena_verified.742.779 webarena_verified.742.781,False,shopping_admin,backend_state,781,train,webarena_verified.742.780 webarena_verified.742.782,False,shopping_admin,backend_state,782,test,webarena_verified.742.781 -webarena_verified.351.783,False,gitlab,retrieve_value,783,train,webarena_verified.351.756 -webarena_verified.316.784,False,gitlab,retrieve_value,784,test,webarena_verified.316.783 +webarena_verified.351.783,False,gitlab,retrieve_value,783,train,webarena_verified.332.756 +webarena_verified.316.784,False,gitlab,retrieve_value,784,test,webarena_verified.351.783 webarena_verified.316.785,False,gitlab,retrieve_value,785,test,webarena_verified.316.784 webarena_verified.316.786,False,gitlab,retrieve_value,786,test,webarena_verified.316.785 webarena_verified.316.787,False,gitlab,retrieve_value,787,test,webarena_verified.316.786 webarena_verified.316.788,False,gitlab,retrieve_value,788,test,webarena_verified.316.787 -webarena_verified.328.789,False,gitlab,retrieve_value,789,test,webarena_verified.328.788 -webarena_verified.246.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.246.782 -webarena_verified.84.791,False,gitlab reddit,string_match,791,train,webarena_verified.84.789 webarena_verified.84.735 -webarena_verified.172.792,False,shopping,retrieve_value,792,test,webarena_verified.172.693 +webarena_verified.328.789,False,gitlab,retrieve_value,789,test,webarena_verified.316.788 
+webarena_verified.246.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.742.782 +webarena_verified.84.791,False,gitlab reddit,string_match,791,train,webarena_verified.27.735 +webarena_verified.172.792,False,shopping,retrieve_value,792,test,webarena_verified.163.693 webarena_verified.172.793,False,shopping,retrieve_value,793,train,webarena_verified.172.792 -webarena_verified.191.794,False,shopping,retrieve_value,794,test,webarena_verified.191.793 +webarena_verified.191.794,False,shopping,retrieve_value,794,test,webarena_verified.172.793 webarena_verified.191.795,False,shopping,retrieve_value,795,train,webarena_verified.191.794 webarena_verified.191.796,False,shopping,retrieve_value,796,train,webarena_verified.191.795 webarena_verified.191.797,False,shopping,retrieve_value,797,test,webarena_verified.191.796 webarena_verified.191.798,False,shopping,retrieve_value,798,train,webarena_verified.191.797 -webarena_verified.600.799,False,gitlab,backend_state,799,train,webarena_verified.600.791 +webarena_verified.600.799,False,gitlab,backend_state,799,train,webarena_verified.84.791 webarena_verified.600.800,False,gitlab,backend_state,800,test,webarena_verified.600.799 webarena_verified.600.801,False,gitlab,backend_state,801,train,webarena_verified.600.800 webarena_verified.600.802,False,gitlab,backend_state,802,train,webarena_verified.600.801 webarena_verified.600.803,False,gitlab,backend_state,803,test,webarena_verified.600.802 -webarena_verified.999.804,False,gitlab,backend_state,804,train,webarena_verified.999.803 -webarena_verified.335.805,False,gitlab,backend_state,805,test,webarena_verified.335.804 +webarena_verified.999.804,False,gitlab,backend_state,804,train,webarena_verified.600.803 +webarena_verified.335.805,False,gitlab,backend_state,805,test,webarena_verified.999.804 webarena_verified.335.806,False,gitlab,backend_state,806,test,webarena_verified.335.805 webarena_verified.335.807,False,gitlab,backend_state,807,train,webarena_verified.335.806 
-webarena_verified.327.808,False,gitlab,backend_state,808,train,webarena_verified.327.807 +webarena_verified.327.808,False,gitlab,backend_state,808,train,webarena_verified.335.807 webarena_verified.327.809,False,gitlab,backend_state,809,train,webarena_verified.327.808 -webarena_verified.999.810,False,gitlab,backend_state,810,test,webarena_verified.999.809 +webarena_verified.999.810,False,gitlab,backend_state,810,test,webarena_verified.327.809 webarena_verified.999.811,False,gitlab,backend_state,811,test,webarena_verified.999.810 From 81f930c50abf57bf0f5239ed594891cb66531261 Mon Sep 17 00:00:00 2001 From: "nicolas.gontier" Date: Fri, 24 Oct 2025 22:15:54 -0400 Subject: [PATCH 20/64] add webarena_verified backend --- .../browsergym/experiments/benchmark/base.py | 2 +- .../experiments/benchmark/configs.py | 2 +- .../browsergym/experiments/benchmark/utils.py | 41 ++++++++++--------- 3 files changed, 23 insertions(+), 22 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py index df4d20eb..d3560b5e 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py @@ -53,7 +53,7 @@ def make_action_set(self): BenchmarkBackend = Literal[ - "miniwob", "webarena", "visualwebarena", "workarena", "assistantbench", "weblinx" + "miniwob", "webarena", "webarena_verified", "visualwebarena", "workarena", "assistantbench", "weblinx" ] diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index f5394a36..9551111d 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -138,7 +138,7 @@ high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, 
supports_parallel_seeds=False, - backends=["webarena"], + backends=["webarena_verified"], env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), max_steps=30, diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index b7f256c8..d9e57fee 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -147,29 +147,30 @@ def prepare_backend(backend: str): import browsergym.webarena_verified # full reset the instance (requires environment variables properly set up) - from browsergym.webarena_verified.instance import WebArenaVerifiedInstance + from browsergym.webarena.instance import WebArenaInstance - default_instance = WebArenaVerifiedInstance() + default_instance = WebArenaInstance() default_instance.full_reset() - # logging.info( - # f"Initiating WebArena Verified instance warm-up. Some tasks will be pre-loaded (massaged) to trigger some caching mechanisms and make the server more responsive." - # ) - # massage_tasks( - # [ - # f"webarena_verified.{id}" - # for id in [ - # 410, # reddit - # 533, # gitlab - # 561, # gitlab wiki - # 562, # gitlab reddit - # 574, # shopping - # 640, # reddit - # 680, # shopping_admin - # 740, # wiki map - # ] - # ] - # ) + logging.info( + f"Initiating WebArena instance warm-up. Some tasks will be pre-loaded (massaged) to trigger some caching mechanisms and make the server more responsive." 
+ ) + massage_tasks( + [ + f"webarena_verified.{intent_template_id}.{task_id}" + for intent_template_id, task_id in + [ + (23, 410), # reddit + # (330, 533), # gitlab + # (87, 561), # gitlab wiki + # (87, 562), # gitlab reddit + (165, 574), # shopping + (16, 640), # reddit + # (253, 680), # shopping_admin + # (94, 740), # wiki map + ] + ] + ) case "visualwebarena": # register environments From cbca5a2a7c444f6b976066ca2be767dfdb26bb04 Mon Sep 17 00:00:00 2001 From: "nicolas.gontier" Date: Fri, 24 Oct 2025 22:16:49 -0400 Subject: [PATCH 21/64] fix wav tasks --- .../src/browsergym/webarena_verified/task.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 643355ce..bf805804 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -2,8 +2,11 @@ import json import logging import tempfile +from pathlib import Path +from time import sleep from typing import Optional +import playwright._impl._errors as playwright_errors import playwright.sync_api from browsergym.webarena.task import GenericWebArenaTask @@ -94,9 +97,23 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # build the evaluator using the new webarena_verified evaluation system self.evaluator = WebArenaVerifiedEvaluator(self.webarena_instance) + # add extra context headers if they are present (e.g. 
for access token to the self hosted webarena verified instances) + extra_headers_file_path = Path(__file__).parent / "pw_extra_headers.json" + if extra_headers_file_path.exists(): + with open(extra_headers_file_path, "r") as f: + extra_headers = json.load(f) + page.context.set_extra_http_headers(extra_headers) + # authenticate for site in self.config["sites"]: - self.webarena_instance.ui_login(site=site, page=page) + for attempt in range(3): + try: + self.webarena_instance.ui_login(site=site, page=page) + break # Success, move to next site + except playwright_errors.TimeoutError as e: + if attempt == 2: # Last attempt (0, 1, 2) + raise # Re-raise the timeout error after 3 failed attempts + sleep(1) # Wait 1 second before retrying # enable playwright tracing (required for webarena_verified evaluation) page.context.tracing.start(snapshots=True) @@ -107,8 +124,7 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # navigate to the starting url(s) (might need several pages) # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/browser_env/envs.py#L150 - if self.config["start_url"]: - start_urls = self.config["start_url"].split(" |AND| ") + if start_urls := self.config.get("start_urls"): for i, url in enumerate(start_urls): page.goto(url) if i < len(start_urls) - 1: From 8d4381be11c5248971d5181a05e1d2e47ce07033 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Sat, 25 Oct 2025 02:32:39 +0000 Subject: [PATCH 22/64] do not check reachable if url is todo --- browsergym/webarena/src/browsergym/webarena/instance.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/browsergym/webarena/src/browsergym/webarena/instance.py b/browsergym/webarena/src/browsergym/webarena/instance.py index 503560f6..8e4c9c8c 100644 --- a/browsergym/webarena/src/browsergym/webarena/instance.py +++ b/browsergym/webarena/src/browsergym/webarena/instance.py @@ -146,6 +146,8 @@ def _check_is_reachable(self, timeout: int): """ for site, url in 
self.urls.items(): + if url == "todo": + continue try: requests.get(url, timeout=timeout) except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): From 63b4b0770ee780f897c333f0d8873cea922d7e0d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 27 Oct 2025 19:51:53 +0000 Subject: [PATCH 23/64] fix tmp trace creation, update goal to prompt model to satisfy wav return format/ --- .../browsergym/experiments/benchmark/utils.py | 1 + .../webarena_verified/evaluators.py | 18 +++++++++--------- .../src/browsergym/webarena_verified/task.py | 19 +++++++++++++++++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index d9e57fee..b1dcaa48 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -160,6 +160,7 @@ def prepare_backend(backend: str): f"webarena_verified.{intent_template_id}.{task_id}" for intent_template_id, task_id in [ + # gitlab, shopping_admin, and map are not ready yet (23, 410), # reddit # (330, 533), # gitlab # (87, 561), # gitlab wiki diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 1bd38ae6..75f5233d 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -8,7 +8,7 @@ import tempfile from pathlib import Path -import playwright +import playwright.sync_api from browsergym.webarena.instance import WebArenaInstance from webarena_verified.api import WebArenaVerifiedDataReader @@ -95,16 +95,16 @@ def __call__( # stop playwright tracing with tempfile.TemporaryDirectory() as temp_dir: - trace_path = Path(temp_dir) / f"wav_{task.task_id}.zip" + 
trace_path = Path(temp_dir) / "trace.zip" page.context.tracing.stop(path=trace_path) - # Create evaluation context - context = TaskEvalContext( - task=task, - agent_response_raw=trajectory[-1].get("answer"), - network_trace=NetworkTrace.from_content(trace_path), - environments=self.evaluator.config.environments, - ) + # Create evaluation context + context = TaskEvalContext( + task=task, + agent_response_raw=trajectory[-1].get("answer"), + network_trace=NetworkTrace.from_content(trace_path), + config=self.evaluator.config, + ) # Run wa_verified evaluation and return float score logger.info(f"Running webarena_verified evaluation for task {task.task_id}") diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index bf805804..c41b3db1 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -11,6 +11,7 @@ from browsergym.webarena.task import GenericWebArenaTask from browsergym.webarena_verified.evaluators import WebArenaVerifiedEvaluator +from webarena_verified.types import FinalAgentResponse logger = logging.getLogger(__name__) @@ -89,8 +90,10 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: self.config = self.random.choice(self.task_configs) # hack: dynamically build a config file to read from - with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f: - json.dump(self.config, f) + with tempfile.NamedTemporaryFile( + mode="w+", delete=False, prefix=f"wav-{self.config['intent_template_id']}-{self.config['task_id']}_", suffix=".json" + ) as f: + json.dump(self.config, f, indent=4) f.flush() self.config_file = f.name @@ -133,6 +136,18 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # recover goal goal = self.config["intent"] + # WebArena Verified requires a specific format for the agent response + response_schema = 
FinalAgentResponse.model_json_schema() + goal += f""" + +--- +Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema: +``` +{json.dumps(response_schema, indent=4)} +``` +Your message in `send_msg_to_user` will be validated against this schema. +""" + # This note is present in all webarena's agent prompts # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 # However, webarena_verified does not have a homepage, so skip this hint From b7f847a62d6cf471a53362941122324905ebca3e Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 28 Oct 2025 20:33:36 +0000 Subject: [PATCH 24/64] create webarena_verified action space with special submit function to match the benchmark expected agent response format --- .../src/browsergym/core/action/functions.py | 50 ++++++++++++++++++- .../src/browsergym/core/action/highlevel.py | 24 +++++++++ .../experiments/benchmark/configs.py | 9 +++- .../src/browsergym/webarena_verified/task.py | 22 ++++---- 4 files changed, 94 insertions(+), 11 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py index c54afbe7..65a2b33d 100644 --- a/browsergym/core/src/browsergym/core/action/functions.py +++ b/browsergym/core/src/browsergym/core/action/functions.py @@ -1,6 +1,7 @@ # these are placeholders # all these symbols will be available in browsergym actions -from typing import Literal +import json +from typing import Any, Literal import playwright.sync_api @@ -24,6 +25,53 @@ inspect.getsource(). 
""" +def send_response_to_wav( + performed_operation: Literal["RETRIEVE", "MUTATE", "NAVIGATE"], + status: Literal["SUCCESS", "ACTION_NOT_ALLOWED_ERROR", "NOT_FOUND_ERROR", "PERMISSION_DENIED_ERROR", "DATA_VALIDATION_ERROR", "UNKNOWN_ERROR"], + retrieved_data: list[str | int | float | bool | dict[str, Any] | None] | None = None, + error_details: str | None = None, +): + """Send the final response. + Args: + performed_operation: The overall type of work performed to attain the task objective. + - RETRIEVE: Use when retrieving data is the main objective of the task + - MUTATE: Use when creating, updating, or deleting data is the main objective of the task + - NAVIGATE: Use when navigating or browsing to show a specific page or location is the main objective of the task + status: The outcome of the task execution. + - SUCCESS: Use when the task objective was fully achieved + - ACTION_NOT_ALLOWED_ERROR: Use when the platform does not support the requested action + - NOT_FOUND_ERROR: Use when the target entity or resource could not be located after retry attempts + - PERMISSION_DENIED_ERROR: Use when the current user lacks permission to perform the action + - DATA_VALIDATION_ERROR: Use when required input data was missing or invalid + - UNKNOWN_ERROR: Use when an unexpected failure doesn't match other categories + retrieved_data: Array of items for 'retrieve' operations, null for 'mutate' and 'navigate' operations. + Returns empty array if no items found. All items must be the same type (either all primitives of the same type, or all objects with the same keys). + Use appropriate data type formats (e.g., numbers for amounts/counts, true/false for booleans, not strings). + For list of objects, the user instruction contains the format specification. + error_details: Null when status is 'SUCCESS'. Otherwise, explains what failed, why it failed, and what was attempted. 
+ + Examples: + send_response_to_wav("RETRIEVE", "SUCCESS", ["The city was built in 1751."]) + send_response_to_wav("RETRIEVE", "SUCCESS", [{"name": "John Doe", "age": 30}]) + send_response_to_wav("RETRIEVE", "SUCCESS", [0,3]) + send_response_to_wav("RETRIEVE", "ACTION_NOT_ALLOWED_ERROR", None) + send_response_to_wav("RETRIEVE", "NOT_FOUND_ERROR", None, "No city found.") + send_response_to_wav("MUTATE", "SUCCESS", None) + send_response_to_wav("MUTATE", "PERMISSION_DENIED_ERROR", None, "User lacks permission to build a city.") + send_response_to_wav("NAVIGATE", "SUCCESS", None) + send_response_to_wav("NAVIGATE", "DATA_VALIDATION_ERROR", None, "Invalid city name.") + send_response_to_wav("NAVIGATE", "UNKNOWN_ERROR", None, "Unexpected error.") + + """ + final_response_dict = { + "performed_operation": performed_operation, + "status": status, + "retrieved_data": retrieved_data, + "error_details": error_details, + } + text = json.dumps(final_response_dict) + send_message_to_user(text) + def send_msg_to_user(text: str): """ diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 97ac0a8d..72d42aca 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -36,6 +36,7 @@ scroll_at, select_option, send_msg_to_user, + send_response_to_wav, tab_close, tab_focus, upload_file, @@ -151,6 +152,28 @@ send_msg_to_user, # STOP | stop(answer) | stop [answer] report_infeasible, ## explicit unachievable action, equivalent STOP "N/A" ], + # webarena_verified agent response schema + # https://github.com/ServiceNow/platform-labs-webarena-verified/blob/main/src/webarena_verified/types/agent_response.py + "webarena_verified": [ + # # code | paper | prompt + scroll, # SCROLL | scroll(dir) | scroll [down|up] + keyboard_press, # KEY_PRESS | press(key_comb) | press [key_comb] + # MOUSE_CLICK | | + # KEYBOARD_TYPE | | + # MOUSE_HOVER | | + 
click, # CLICK | click(elem) | click [id] + fill, # TYPE | type(elem, text) | type [id] [content] + hover, # HOVER | hover(elem) | hover [id] + tab_focus, # PAGE_FOCUS | tab_focus(index) | tab_focus [tab_index] + new_tab, # NEW_TAB | new_tab() | new_tab + go_back, # GO_BACK | go_back() | go_back + go_forward, # GO_FORWARD | go_forward() | go_forward + goto, # GOTO_URL | goto(url) | goto [url] + tab_close, # PAGE_CLOSE | tab_close() | close_tab + # CHECK | | + select_option, # SELECT_OPTION | | + send_response_to_wav, # STOP | stop(answer) | stop [answer] + ], # from the visualwebarena paper # https://arxiv.org/abs/2401.13649 # from the visualwebarena source code @@ -272,6 +295,7 @@ class HighLevelActionSet(AbstractActionSet): "miniwob_liu18", "miniwob_humphreys22", "webarena", + "webarena_verified", "visualwebarena", "workarena", "workarena++", diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 9551111d..6b030ccb 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -65,6 +65,13 @@ retry_with_force=True, demo_mode="off", ), + "webarena_verified": HighLevelActionSetArgs( + subsets=["webarena_verified"], + multiaction=False, + strict=False, + retry_with_force=True, + demo_mode="off", + ), # from https://arxiv.org/abs/2401.13649 "visualwebarena": HighLevelActionSetArgs( subsets=["visualwebarena"], @@ -135,7 +142,7 @@ ), "webarena_verified": lambda n_repeats=1: Benchmark( name="webarena_verified", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena_verified"], is_multi_tab=True, supports_parallel_seeds=False, backends=["webarena_verified"], diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py 
b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index c41b3db1..ce94c5ed 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -137,16 +137,20 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: goal = self.config["intent"] # WebArena Verified requires a specific format for the agent response - response_schema = FinalAgentResponse.model_json_schema() - goal += f""" - ---- -Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema: -``` -{json.dumps(response_schema, indent=4)} -``` -Your message in `send_msg_to_user` will be validated against this schema. + goal += """ + +When you are done, send your final answer to the user with `send_response_to_wav`. """ +# response_schema = FinalAgentResponse.model_json_schema() +# goal += f""" + +# --- +# Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema: +# ``` +# {json.dumps(response_schema, indent=4)} +# ``` +# Your message in `send_msg_to_user` will be validated against this schema. 
+# """ # This note is present in all webarena's agent prompts # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 From 3e6b5b7f8d199b2aa3485cac22ed9066b3fb1119 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 29 Oct 2025 00:24:31 +0000 Subject: [PATCH 25/64] look for extra header file path in environment variable --- .../src/browsergym/webarena_verified/task.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index ce94c5ed..711bc836 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -1,6 +1,7 @@ import importlib.resources import json import logging +import os import tempfile from pathlib import Path from time import sleep @@ -101,11 +102,14 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: self.evaluator = WebArenaVerifiedEvaluator(self.webarena_instance) # add extra context headers if they are present (e.g. for access token to the self hosted webarena verified instances) - extra_headers_file_path = Path(__file__).parent / "pw_extra_headers.json" - if extra_headers_file_path.exists(): - with open(extra_headers_file_path, "r") as f: - extra_headers = json.load(f) - page.context.set_extra_http_headers(extra_headers) + if os.environ.get("PW_EXTRA_HEADERS"): + extra_headers_file_path = Path(os.environ["PW_EXTRA_HEADERS"]) + try: + with open(extra_headers_file_path, "r") as f: + extra_headers = json.load(f) + page.context.set_extra_http_headers(extra_headers) + except Exception as e: + logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. 
Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers.") # authenticate for site in self.config["sites"]: From 525fd3b4a0fa9cfe6b444561313410320b487882 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 31 Oct 2025 18:37:39 +0000 Subject: [PATCH 26/64] undo special action set for webarena_verified --- .../src/browsergym/core/action/functions.py | 50 +------------------ .../src/browsergym/core/action/highlevel.py | 23 --------- .../experiments/benchmark/configs.py | 2 +- .../src/browsergym/webarena_verified/task.py | 39 ++++----------- 4 files changed, 11 insertions(+), 103 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/functions.py b/browsergym/core/src/browsergym/core/action/functions.py index 65a2b33d..c54afbe7 100644 --- a/browsergym/core/src/browsergym/core/action/functions.py +++ b/browsergym/core/src/browsergym/core/action/functions.py @@ -1,7 +1,6 @@ # these are placeholders # all these symbols will be available in browsergym actions -import json -from typing import Any, Literal +from typing import Literal import playwright.sync_api @@ -25,53 +24,6 @@ inspect.getsource(). """ -def send_response_to_wav( - performed_operation: Literal["RETRIEVE", "MUTATE", "NAVIGATE"], - status: Literal["SUCCESS", "ACTION_NOT_ALLOWED_ERROR", "NOT_FOUND_ERROR", "PERMISSION_DENIED_ERROR", "DATA_VALIDATION_ERROR", "UNKNOWN_ERROR"], - retrieved_data: list[str | int | float | bool | dict[str, Any] | None] | None = None, - error_details: str | None = None, -): - """Send the final response. - Args: - performed_operation: The overall type of work performed to attain the task objective. 
- - RETRIEVE: Use when retrieving data is the main objective of the task - - MUTATE: Use when creating, updating, or deleting data is the main objective of the task - - NAVIGATE: Use when navigating or browsing to show a specific page or location is the main objective of the task - status: The outcome of the task execution. - - SUCCESS: Use when the task objective was fully achieved - - ACTION_NOT_ALLOWED_ERROR: Use when the platform does not support the requested action - - NOT_FOUND_ERROR: Use when the target entity or resource could not be located after retry attempts - - PERMISSION_DENIED_ERROR: Use when the current user lacks permission to perform the action - - DATA_VALIDATION_ERROR: Use when required input data was missing or invalid - - UNKNOWN_ERROR: Use when an unexpected failure doesn't match other categories - retrieved_data: Array of items for 'retrieve' operations, null for 'mutate' and 'navigate' operations. - Returns empty array if no items found. All items must be the same type (either all primitives of the same type, or all objects with the same keys). - Use appropriate data type formats (e.g., numbers for amounts/counts, true/false for booleans, not strings). - For list of objects, the user instruction contains the format specification. - error_details: Null when status is 'SUCCESS'. Otherwise, explains what failed, why it failed, and what was attempted. 
- - Examples: - send_response_to_wav("RETRIEVE", "SUCCESS", ["The city was built in 1751."]) - send_response_to_wav("RETRIEVE", "SUCCESS", [{"name": "John Doe", "age": 30}]) - send_response_to_wav("RETRIEVE", "SUCCESS", [0,3]) - send_response_to_wav("RETRIEVE", "ACTION_NOT_ALLOWED_ERROR", None) - send_response_to_wav("RETRIEVE", "NOT_FOUND_ERROR", None, "No city found.") - send_response_to_wav("MUTATE", "SUCCESS", None) - send_response_to_wav("MUTATE", "PERMISSION_DENIED_ERROR", None, "User lacks permission to build a city.") - send_response_to_wav("NAVIGATE", "SUCCESS", None) - send_response_to_wav("NAVIGATE", "DATA_VALIDATION_ERROR", None, "Invalid city name.") - send_response_to_wav("NAVIGATE", "UNKNOWN_ERROR", None, "Unexpected error.") - - """ - final_response_dict = { - "performed_operation": performed_operation, - "status": status, - "retrieved_data": retrieved_data, - "error_details": error_details, - } - text = json.dumps(final_response_dict) - send_message_to_user(text) - def send_msg_to_user(text: str): """ diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 72d42aca..81b93dcc 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -36,7 +36,6 @@ scroll_at, select_option, send_msg_to_user, - send_response_to_wav, tab_close, tab_focus, upload_file, @@ -152,28 +151,6 @@ send_msg_to_user, # STOP | stop(answer) | stop [answer] report_infeasible, ## explicit unachievable action, equivalent STOP "N/A" ], - # webarena_verified agent response schema - # https://github.com/ServiceNow/platform-labs-webarena-verified/blob/main/src/webarena_verified/types/agent_response.py - "webarena_verified": [ - # # code | paper | prompt - scroll, # SCROLL | scroll(dir) | scroll [down|up] - keyboard_press, # KEY_PRESS | press(key_comb) | press [key_comb] - # MOUSE_CLICK | | - # KEYBOARD_TYPE | | - # MOUSE_HOVER | | - 
click, # CLICK | click(elem) | click [id] - fill, # TYPE | type(elem, text) | type [id] [content] - hover, # HOVER | hover(elem) | hover [id] - tab_focus, # PAGE_FOCUS | tab_focus(index) | tab_focus [tab_index] - new_tab, # NEW_TAB | new_tab() | new_tab - go_back, # GO_BACK | go_back() | go_back - go_forward, # GO_FORWARD | go_forward() | go_forward - goto, # GOTO_URL | goto(url) | goto [url] - tab_close, # PAGE_CLOSE | tab_close() | close_tab - # CHECK | | - select_option, # SELECT_OPTION | | - send_response_to_wav, # STOP | stop(answer) | stop [answer] - ], # from the visualwebarena paper # https://arxiv.org/abs/2401.13649 # from the visualwebarena source code diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 6b030ccb..543eeecd 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -142,7 +142,7 @@ ), "webarena_verified": lambda n_repeats=1: Benchmark( name="webarena_verified", - high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena_verified"], + high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], is_multi_tab=True, supports_parallel_seeds=False, backends=["webarena_verified"], diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 711bc836..579c2355 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -141,36 +141,15 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: goal = self.config["intent"] # WebArena Verified requires a specific format for the agent response - goal += """ - -When you are done, send your final answer to the user with `send_response_to_wav`. 
-""" -# response_schema = FinalAgentResponse.model_json_schema() -# goal += f""" - -# --- -# Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema: -# ``` -# {json.dumps(response_schema, indent=4)} -# ``` -# Your message in `send_msg_to_user` will be validated against this schema. -# """ - - # This note is present in all webarena's agent prompts - # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/agent/prompts/raw/p_cot_id_actree_2s.py#L34 - # However, webarena_verified does not have a homepage, so skip this hint - self.with_homepage_hint = False - if self.with_homepage_hint: - goal += f""" - -(Note: if you want to visit other websites, check out the homepage at {self.webarena_instance.home_url}. It has a list of websites you can visit. {self.webarena_instance.home_url}/password.html lists all the account name and password for the websites. You can use them to log in to the websites.) -""" - - # This note is present in some of webarena's agent prompts - if self.with_na_hint: - goal += """\ - -If you believe the task is impossible to complete, provide the answer "N/A". + response_schema = FinalAgentResponse.model_json_schema() + goal += f""" + +--- +Final response format: When you send your final answer to the user with `send_msg_to_user`, your message must be a json formatted string that matches the following schema: +``` +{json.dumps(response_schema, indent=4)} +``` +Your message in `send_msg_to_user` will be validated against this schema. 
""" return goal, {} From 4272b5e2ed4941c48cf059dd62349b38a9baa423 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 31 Oct 2025 18:46:17 +0000 Subject: [PATCH 27/64] remove wav actions --- .../src/browsergym/core/action/highlevel.py | 1 - .../browsergym/experiments/benchmark/configs.py | 7 ------- browsergym/webarena_verified/README.md | 17 ++++------------- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/browsergym/core/src/browsergym/core/action/highlevel.py b/browsergym/core/src/browsergym/core/action/highlevel.py index 81b93dcc..97ac0a8d 100644 --- a/browsergym/core/src/browsergym/core/action/highlevel.py +++ b/browsergym/core/src/browsergym/core/action/highlevel.py @@ -272,7 +272,6 @@ class HighLevelActionSet(AbstractActionSet): "miniwob_liu18", "miniwob_humphreys22", "webarena", - "webarena_verified", "visualwebarena", "workarena", "workarena++", diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 543eeecd..9551111d 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -65,13 +65,6 @@ retry_with_force=True, demo_mode="off", ), - "webarena_verified": HighLevelActionSetArgs( - subsets=["webarena_verified"], - multiaction=False, - strict=False, - retry_with_force=True, - demo_mode="off", - ), # from https://arxiv.org/abs/2401.13649 "visualwebarena": HighLevelActionSetArgs( subsets=["visualwebarena"], diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 43ae475d..d6bbc02e 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -1,15 +1,15 @@ # WebArena Verified benchmark for BrowserGym -This package provides `browsergym.webarena_verified`, which integrates the WebArena Verified benchmark from the 
[platform-labs-agent-eval-harness](https://github.com/ServiceNow/platform-labs-agent-eval-harness) into BrowserGym. +This package provides `browsergym.webarena_verified`, which integrates the WebArena Verified benchmark from the [platform-labs-webarena-verified](https://github.com/ServiceNow/platform-labs-webarena-verified) into BrowserGym. ## Installation ### 0. Prerequisites -Before installing this package, you need to clone the platform-labs-agent-eval-harness repository locally: +Before installing this package, you need to clone the platform-labs-webarena-verified repository locally: ```bash -git clone https://github.com/ServiceNow/platform-labs-agent-eval-harness.git /home/toolkit/platform-labs-agent-eval-harness +git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified ``` ### 1. Install this BrowserGym package @@ -19,18 +19,9 @@ pip install -e ./browsergym/webarena_verified ``` This will automatically install the required dependencies from local file paths: -- `webarena-verified` from local platform-labs-agent-eval-harness repository -- `agent-eval-harness-common` from local platform-labs-agent-eval-harness repository -- `agent-eval-harness-pytest` from local platform-labs-agent-eval-harness repository +- `webarena-verified` from local platform-labs-webarena-verified -**Note**: This package requires the [platform-labs-agent-eval-harness](https://github.com/ServiceNow/platform-labs-agent-eval-harness) repository to be cloned locally at `/home/toolkit/platform-labs-agent-eval-harness` before installation. -### 2. 
Download required resources - -```bash -# Download NLTK tokenizer resources -python -c "import nltk; nltk.download('punkt_tab')" -``` ## Setup From fea25ed7e0e666c949a96f68f348109b0002101c Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 3 Nov 2025 16:09:49 +0000 Subject: [PATCH 28/64] load extra context headers for webarena(+lite) --- browsergym/webarena/src/browsergym/webarena/task.py | 12 ++++++++++++ .../src/browsergym/webarenalite/task.py | 13 ++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py index 3467c152..628c23b2 100644 --- a/browsergym/webarena/src/browsergym/webarena/task.py +++ b/browsergym/webarena/src/browsergym/webarena/task.py @@ -1,8 +1,10 @@ import importlib.resources import json import logging +import os import tempfile import urllib.parse +from pathlib import Path from typing import Optional, Tuple import numpy as np @@ -101,6 +103,16 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # build the evaluator self.evaluator = evaluator_router(self.config_file) + # add extra context headers if they are present (e.g. for access token to the self hosted webarena verified instances) + if os.environ.get("PW_EXTRA_HEADERS"): + extra_headers_file_path = Path(os.environ["PW_EXTRA_HEADERS"]) + try: + with open(extra_headers_file_path, "r") as f: + extra_headers = json.load(f) + page.context.set_extra_http_headers(extra_headers) + except Exception as e: + logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. 
Continuing without extra headers.") + # authenticate for site in self.config["sites"]: self.webarena_instance.ui_login(site=site, page=page) diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/task.py b/browsergym/webarenalite/src/browsergym/webarenalite/task.py index 501c8a29..6f0741f7 100644 --- a/browsergym/webarenalite/src/browsergym/webarenalite/task.py +++ b/browsergym/webarenalite/src/browsergym/webarenalite/task.py @@ -1,14 +1,15 @@ import importlib.resources import json import logging +import os import tempfile +from pathlib import Path from typing import Optional import playwright.sync_api from browsergym.webarena.task import GenericWebArenaTask - logger = logging.getLogger(__name__) @@ -87,6 +88,16 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # build the evaluator self.evaluator = evaluator_router(self.config_file) + # add extra context headers if they are present (e.g. for access token to the self hosted webarena verified instances) + if os.environ.get("PW_EXTRA_HEADERS"): + extra_headers_file_path = Path(os.environ["PW_EXTRA_HEADERS"]) + try: + with open(extra_headers_file_path, "r") as f: + extra_headers = json.load(f) + page.context.set_extra_http_headers(extra_headers) + except Exception as e: + logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. 
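Continuing without extra headers.") + # authenticate for site in self.config["sites"]: self.webarena_instance.ui_login(site=site, page=page) diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/task.py b/browsergym/webarenalite/src/browsergym/webarenalite/task.py index 501c8a29..6f0741f7 100644 --- a/browsergym/webarenalite/src/browsergym/webarenalite/task.py +++ b/browsergym/webarenalite/src/browsergym/webarenalite/task.py @@ -1,14 +1,15 @@ import importlib.resources import json import logging +import os import tempfile +from pathlib import Path from typing import Optional import playwright.sync_api from browsergym.webarena.task import GenericWebArenaTask - logger = logging.getLogger(__name__) @@ -87,6 +88,16 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # build the evaluator self.evaluator = evaluator_router(self.config_file) + # add extra context headers if they are present (e.g. for access token to the self hosted webarena verified instances) + if os.environ.get("PW_EXTRA_HEADERS"): + extra_headers_file_path = Path(os.environ["PW_EXTRA_HEADERS"]) + try: + with open(extra_headers_file_path, "r") as f: + extra_headers = json.load(f) + page.context.set_extra_http_headers(extra_headers) + except Exception as e: + logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers.") + # authenticate for site in self.config["sites"]: self.webarena_instance.ui_login(site=site, page=page) From fc090f05d2d87cc71c675ff5cdc5534f90e5426b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 5 Nov 2025 20:19:43 +0000 Subject: [PATCH 29/64] update README --- browsergym/webarena_verified/README.md | 121 +++++++------------------ 1 file changed, 35 insertions(+), 86 deletions(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index d6bbc02e..d3858e7c 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -1,109 +1,58 @@ # WebArena Verified benchmark for BrowserGym -This package provides `browsergym.webarena_verified`, which integrates the WebArena Verified benchmark from the [platform-labs-webarena-verified](https://github.com/ServiceNow/platform-labs-webarena-verified) into BrowserGym. +This package provides `browsergym.webarena_verified`, which integrates the [WebArena Verified benchmark](https://github.com/ServiceNow/platform-labs-webarena-verified) into BrowserGym. -## Installation +## WebArena Server Deployment -### 0. Prerequisites +Follow the official [webarena README](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) -Before installing this package, you need to clone the platform-labs-webarena-verified repository locally: +## Setup + +#### 1. Install webarena-verified in the same folder that contains BrowserGym ```bash -git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified +git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../webarena-verified +pip install -e ../webarena-verified ``` -### 1. Install this BrowserGym package +#### 2.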
Install this BrowserGym package ```bash pip install -e ./browsergym/webarena_verified ``` -This will automatically install the required dependencies from local file paths: -- `webarena-verified` from local platform-labs-webarena-verified - - - -## Setup - -### Environment Variables - -Set up the WebArena environment URLs. The ports should correspond to your WebArena instance setup: - +Alternatively, you can also run: ```bash -BASE_URL= # example: "http://myazuremachine.eastus.cloudapp.azure.com" - -# WebArena environment variables (change ports as needed) -export WA_SHOPPING="$BASE_URL:8082/" -export WA_SHOPPING_ADMIN="$BASE_URL:8083/admin" -export WA_REDDIT="$BASE_URL:8080" -export WA_GITLAB="$BASE_URL:9001" -export WA_WIKIPEDIA="$BASE_URL:8081/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" -export WA_MAP="$BASE_URL:443" -export WA_HOMEPAGE="$BASE_URL:80" - -# Optional: Full reset functionality -export WA_FULL_RESET="$BASE_URL:7565" +make install ``` -### API Keys - -Set up required API keys: +#### 3. Setup WebArena environment URLs ```bash -# OpenAI API key (required for LLM-based evaluations) -export OPENAI_API_KEY=... - -# Optional: Langfuse API key for tracing -export LANGFUSE_PUBLIC_KEY=... -export LANGFUSE_SECRET_KEY=... +export WA_SHOPPING="..." +export WA_SHOPPING_ADMIN=".../admin" +export WA_REDDIT="..." +export WA_GITLAB="..." +export WA_WIKIPEDIA=".../wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing" +export WA_MAP="..." +export WA_HOMEPAGE="..." + +# (optional) path to a JSON file containing extra HTTP headers for playwright to send with its requests (for instance, a secret token to access your self hosted webarena instances). +export PW_EXTRA_HEADERS="..."
``` -## Usage - +If you don't have a running environment for a domain, simply replace its URL with `"todo"`, +and filter the benchmark tasks like so: ```python -import browsergym.webarena_verified - -# The package automatically registers all WebArena Verified tasks -# Task IDs range from 0 to 811 (812 total tasks) - -# Example: Run a specific task -from browsergym.webarena_verified import ALL_WEBARENA_TASK_IDS -print(f"Available tasks: {len(ALL_WEBARENA_TASK_IDS)}") - -# Example: Create a task -from browsergym.webarena_verified.task import WebArenaVerifiedTask - -task = WebArenaVerifiedTask(seed=42, task_id=0) +import bgym +from browsergym.experiments.benchmark.metadata.utils import task_metadata + +domains = ["shopping", "reddit"] # only consider 'shopping' or 'reddit' tasks +task_list = [] +for domain in domains: + task_list.extend(task_metadata("webarena_verified").groupby("sites").get_group(domain).task_name.to_list()) +benchmark = bgym.DEFAULT_BENCHMARKS["webarena_verified"]() # type: bgym.Benchmark +benchmark = benchmark.subset_from_list( + task_list, benchmark_name_suffix=f"only_{'-'.join(domains)}" +) ``` - -## Task Configuration - -WebArena Verified tasks are configured via the `webarena_verified.json` file, which includes: - -- **Task metadata**: task_id, intent, intent_template -- **Environment setup**: sites, start_url, geolocation -- **Evaluation criteria**: expected_retrieve_value, expected_backend_state, expected_ui_state -- **Authentication**: storage_state for logged-in sessions - -## Evaluation System - -The evaluation system supports three types of validation: - -1. **Retrieve Value**: Validates that the agent successfully retrieved the expected information -2. **Backend State**: Validates that the agent made the expected changes to the backend/database -3.
**UI State**: Validates that the agent achieved the expected UI state - -## Differences from Original WebArena - -- Enhanced evaluation with multiple validation types -- Integration with platform-labs evaluation framework -- Support for more sophisticated task validation -- Better error handling and logging -- Structured agent response format - -## Troubleshooting - -- Ensure all environment variables are set correctly -- Verify that the WebArena instance is running and accessible -- Check that all required API keys are configured -- Review logs for detailed error information From 377dcca68ea79897b78aa8d1eddf48788bec50fc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 5 Nov 2025 20:20:04 +0000 Subject: [PATCH 30/64] update requirements --- browsergym/webarena_verified/requirements.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index dc30c254..7d961d76 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1,3 +1 @@ -browsergym-core==0.14.2 -libwebarena==0.0.4 -webarena-verified @ file:///home/toolkit/platform-labs-webarena-verified +browsergym-core==0.14.2 \ No newline at end of file From 1f02f3f1342caee858867b616addd8229d55e06b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Wed, 5 Nov 2025 20:42:03 +0000 Subject: [PATCH 31/64] update makefile and readme --- Makefile | 31 ++++++++++++++++++-------- browsergym/webarena_verified/README.md | 21 +++++++---------- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/Makefile b/Makefile index 1d5834bb..0d098057 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,18 @@ setup-miniwob: @echo "💡 To use MiniWoB++, load the environment variables:" @echo " source .env" +setup-webarena-verified: + @echo "--- 🌐 Setting up WebArena Verified ---" + @if [ ! 
-d "../platform-labs-webarena-verified" ]; then \ + echo "Cloning WebArena Verified repository..."; \ + git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified; \ + else \ + echo "WebArena Verified repository already exists, skipping clone..."; \ + fi + @echo "Installing WebArena Verified package..." + pip install -e ../platform-labs-webarena-verified + @echo "✅ WebArena Verified setup complete!" + test-core: @echo "--- 🧪 Running tests ---" pytest -n auto ./tests/core @@ -39,12 +51,13 @@ clean-miniwob: help: @echo "Available targets:" - @echo " install - Install project dependencies" - @echo " setup-miniwob - Setup MiniWoB++ dependencies" - @echo " install-demo - Install demo dependencies" - @echo " demo - Run demo agent" - @echo " test-core - Run core tests" - @echo " clean-miniwob - Remove MiniWoB++ directory" - @echo " help - Show this help message" - -.PHONY: install setup-miniwob install-demo demo test-core clean-miniwob help + @echo " install - Install project dependencies" + @echo " setup-miniwob - Setup MiniWoB++ dependencies" + @echo " setup-webarena-verified - Setup WebArena Verified dependencies" + @echo " install-demo - Install demo dependencies" + @echo " demo - Run demo agent" + @echo " test-core - Run core tests" + @echo " clean-miniwob - Remove MiniWoB++ directory" + @echo " help - Show this help message" + +.PHONY: install setup-miniwob setup-webarena-verified install-demo demo test-core clean-miniwob help diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index d3858e7c..6c5f392d 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -6,27 +6,22 @@ This package provides `browsergym.webarena_verified`, which integrates the [WebA Follow the official [webarena README](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) -## Setup +## WebArena Verified Setup -#### 1. 
Install webarena-verified in the same folder that contains BrowserGym +#### 1. Install webarena-verified ```bash -git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../webarena-verified -pip install -e ../webarena-verified -``` - -#### 2. Install this BrowserGym package - -```bash -pip install -e ./browsergym/webarena_verified +make install +make setup-webarena-verified # this commands will install webarena-verified in the same folder that contains BrowserGym ``` - Alternatively, you can also run: ```bash -make install +pip install -e ./browsergym/webarena_verified +git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified +pip install -e ../platform-labs-webarena-verified ``` -#### 3. Setup WebArena environment URLs +#### 2. Setup WebArena environment URLs ```bash export WA_SHOPPING="..." From f86a2b35279ea37321c1c484053362620b8f6c6a Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 6 Nov 2025 15:35:16 +0000 Subject: [PATCH 32/64] update readme --- README.md | 3 +++ browsergym/webarena_verified/README.md | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 874950f5..a6ca88cf 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ _Example of a GPT4-V agent executing openended tasks (top row, chat interactive) BrowserGym includes the following benchmarks by default: - [MiniWoB](https://miniwob.farama.org/) - [WebArena](https://webarena.dev/) + - [WebArenaVerified](https://github.com/ServiceNow/platform-labs-webarena-verified) - [VisualWebArena](https://jykoh.com/vwa) - [WorkArena](https://github.com/ServiceNow/WorkArena) - [AssistantBench](https://github.com/oriyor/assistantbench) @@ -55,6 +56,7 @@ pip install browsergym-experiments # experiment utilities (agent, loop, benchma pip install browsergym-core # core functionalities only (no benchmark, just the openended task) pip install browsergym-miniwob # core + miniwob pip install 
browsergym-webarena # core + webarena +pip install browsergym-webarena-verified # core + webarena_verified pip install browsergym-visualwebarena # core + visualwebarena pip install browsergym-workarena # core + workarena pip install browsergym-assistantbench # core + assistantbench @@ -69,6 +71,7 @@ playwright install chromium Finally, each benchmark comes with its own specific setup that requires to follow additional steps. - for MiniWoB++, see [miniwob/README.md](browsergym/miniwob/README.md) - for WebArena, see [webarena/README.md](browsergym/webarena/README.md) + - for WebArenaVerified, see [webarena_verified/README.md](browsergym/webarena_verified/README.md) - for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md) - for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena) - for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 6c5f392d..7921ef22 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -2,7 +2,7 @@ This package provides `browsergym.webarena_verified`, which integrates the [WebArena Verified benchmark](https://github.com/ServiceNow/platform-labs-webarena-verified) into BrowserGym. 
-## WebArena Server Deployement +## WebArena Server Deployment Follow the official [webarena README](https://github.com/web-arena-x/webarena/blob/main/environment_docker/README.md) @@ -12,7 +12,7 @@ Follow the official [webarena README](https://github.com/web-arena-x/webarena/bl ```bash make install -make setup-webarena-verified # this commands will install webarena-verified in the same folder that contains BrowserGym +make setup-webarena-verified # this commands will clone & install webarena-verified locally in the same folder that contains BrowserGym ``` Alternatively, you can also run: ```bash From df3bfa44cb4c5d10cdd85a0ac8b68424894af28a Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 6 Nov 2025 15:41:40 +0000 Subject: [PATCH 33/64] update requirements --- browsergym/webarena_verified/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 7d961d76..6a0d5f79 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1 +1 @@ -browsergym-core==0.14.2 \ No newline at end of file +browsergym-core==0.14.3.dev1 \ No newline at end of file From bf6cd9a2d5bc00f4960ba96f29cb2b9dc222a1fe Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 6 Nov 2025 16:01:53 +0000 Subject: [PATCH 34/64] update readme --- browsergym/webarena_verified/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 7921ef22..6504510d 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -12,7 +12,7 @@ Follow the official [webarena README](https://github.com/web-arena-x/webarena/bl ```bash make install -make setup-webarena-verified # this commands will clone & install webarena-verified locally in the same folder that contains BrowserGym +make 
setup-webarena-verified # this command will clone & install webarena-verified locally in the same folder that contains BrowserGym ``` Alternatively, you can also run: ```bash From 76ab14e157576e92daa5229b058493a251299c0c Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 6 Nov 2025 16:14:31 +0000 Subject: [PATCH 35/64] update test --- tests/experiments/test_benchmark.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/experiments/test_benchmark.py b/tests/experiments/test_benchmark.py index 14e598df..c4ba1b40 100644 --- a/tests/experiments/test_benchmark.py +++ b/tests/experiments/test_benchmark.py @@ -49,6 +49,7 @@ def test_build_benchmarks(): "miniwob": 125 * 5, "miniwob_tiny_test": 2 * 2, "webarena": 812, + "webarena_verified": 812, "webarena_tiny": 6, "webarena_lite": 165, "visualwebarena": 910, From eae4152602cb39fc451810dfabbfe75a9f4a7a25 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 6 Nov 2025 16:22:10 +0000 Subject: [PATCH 36/64] black formater --- .../browsergym/experiments/benchmark/base.py | 8 +++++++- .../browsergym/experiments/benchmark/utils.py | 3 +-- .../webarena/src/browsergym/webarena/task.py | 4 +++- .../browsergym/webarena_verified/config.py | 4 +++- .../webarena_verified/evaluators.py | 20 ++++++++++++------- .../src/browsergym/webarena_verified/task.py | 17 ++++++++++------ .../src/browsergym/webarenalite/task.py | 4 +++- 7 files changed, 41 insertions(+), 19 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py index d3560b5e..64cd017f 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/base.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/base.py @@ -53,7 +53,13 @@ def make_action_set(self): BenchmarkBackend = Literal[ - "miniwob", "webarena", "webarena_verified", "visualwebarena", "workarena", "assistantbench", "weblinx" + "miniwob", + "webarena", + 
"webarena_verified", + "visualwebarena", + "workarena", + "assistantbench", + "weblinx", ] diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index b1dcaa48..e87ab50c 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -158,8 +158,7 @@ def prepare_backend(backend: str): massage_tasks( [ f"webarena_verified.{intent_template_id}.{task_id}" - for intent_template_id, task_id in - [ + for intent_template_id, task_id in [ # gitlab, shopping_admin, and map are not ready yet (23, 410), # reddit # (330, 533), # gitlab diff --git a/browsergym/webarena/src/browsergym/webarena/task.py b/browsergym/webarena/src/browsergym/webarena/task.py index 628c23b2..b8275820 100644 --- a/browsergym/webarena/src/browsergym/webarena/task.py +++ b/browsergym/webarena/src/browsergym/webarena/task.py @@ -111,7 +111,9 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: extra_headers = json.load(f) page.context.set_extra_http_headers(extra_headers) except Exception as e: - logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers.") + logger.warning( + f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers." 
+ ) # authenticate for site in self.config["sites"]: diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index b9766ad7..550b9af2 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -10,4 +10,6 @@ for task in data: INTENT_TEMPLATE_IDS.append(task["intent_template_id"]) -assert len(INTENT_TEMPLATE_IDS) == len(TASK_IDS), "Number of intent template IDs must match number of task IDs" +assert len(INTENT_TEMPLATE_IDS) == len( + TASK_IDS +), "Number of intent template IDs must match number of task IDs" diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 75f5233d..8242851e 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -33,7 +33,7 @@ class WebArenaVerifiedEvaluator: """ Evaluator that integrates the webarena_verified evaluation system. 
- + This evaluator handles the new evaluation format with: - expected_retrieve_value: Validates data retrieval - expected_backend_state: Validates backend/database changes @@ -57,8 +57,8 @@ def __init__(self, webarena_instance: WebArenaInstance): }, WebArenaSite.HOMEPAGE: EnvironmentConfig( urls=[webarena_instance.home_url], - ) - } + ), + }, ) # Instantiate data reader and evaluator reader = WebArenaVerifiedDataReader(config) @@ -84,6 +84,7 @@ def __call__( """ # import webarena dynamically from webarena.browser_env.actions import ActionTypes + # if last action is not a STOP action, return 0.0 as the task is not completed yet if trajectory[-1].get("action_type") != ActionTypes.STOP: return 0.0 @@ -110,9 +111,14 @@ def __call__( logger.info(f"Running webarena_verified evaluation for task {task.task_id}") results: TaskEvalResult = self.evaluator.evaluate_task(context=context) logger.info(f"Webarena_verified evaluation result for task {task.task_id}:") - logger.info(f"status: {results.status}, score: {results.score}, error_msg: {results.error_msg}") + logger.info( + f"status: {results.status}, score: {results.score}, error_msg: {results.error_msg}" + ) for result in results.evaluators_results: - logger.info(f"- {result.evaluator_name}: status: {result.status}, score: {result.score}, error_msg: {result.error_msg}") + logger.info( + f"- {result.evaluator_name}: status: {result.status}, score: {result.score}, error_msg: {result.error_msg}" + ) # return average score - return sum(result.score for result in results.evaluators_results) / len(results.evaluators_results) - + return sum(result.score for result in results.evaluators_results) / len( + results.evaluators_results + ) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 579c2355..1afa34d0 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ 
b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -21,10 +21,10 @@ class WebArenaVerifiedTask(GenericWebArenaTask): """ WebArena Verified task class that integrates the full evaluation system from platform-labs-agent-eval-harness. - + This task class handles the new evaluation format with: - expected_retrieve_value - - expected_backend_state + - expected_backend_state - expected_ui_state """ @@ -50,7 +50,7 @@ def __init__( .joinpath("webarena_verified.json") .read_text() ) - + # substitute URLs for pattern, url_key in { "__GITLAB__": "gitlab", @@ -90,9 +90,12 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # pick a task at random self.config = self.random.choice(self.task_configs) - # hack: dynamically build a config file to read from + # dynamically build a config file to read from with tempfile.NamedTemporaryFile( - mode="w+", delete=False, prefix=f"wav-{self.config['intent_template_id']}-{self.config['task_id']}_", suffix=".json" + mode="w+", + delete=False, + prefix=f"wav-{self.config['intent_template_id']}-{self.config['task_id']}_", + suffix=".json", ) as f: json.dump(self.config, f, indent=4) f.flush() @@ -109,7 +112,9 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: extra_headers = json.load(f) page.context.set_extra_http_headers(extra_headers) except Exception as e: - logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers.") + logger.warning( + f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers." 
+ ) # authenticate for site in self.config["sites"]: diff --git a/browsergym/webarenalite/src/browsergym/webarenalite/task.py b/browsergym/webarenalite/src/browsergym/webarenalite/task.py index 6f0741f7..a62a27a7 100644 --- a/browsergym/webarenalite/src/browsergym/webarenalite/task.py +++ b/browsergym/webarenalite/src/browsergym/webarenalite/task.py @@ -96,7 +96,9 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: extra_headers = json.load(f) page.context.set_extra_http_headers(extra_headers) except Exception as e: - logger.warning(f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers.") + logger.warning( + f"Failed to load extra headers from {extra_headers_file_path}: {e}. Make sure to set the PW_EXTRA_HEADERS environment variable to the path of an existing json file containing the extra headers. Continuing without extra headers." + ) # authenticate for site in self.config["sites"]: From afdf21895bb161dd358d3ef5f094fe427f516267 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 10 Nov 2025 20:44:58 +0000 Subject: [PATCH 37/64] upd makefile --- Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile b/Makefile index 0d098057..ea1fb75c 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,7 @@ setup-webarena-verified: fi @echo "Installing WebArena Verified package..." pip install -e ../platform-labs-webarena-verified + cp ../platform-labs-webarena-verified/assets/dataset/webarena-verified.json ./browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @echo "✅ WebArena Verified setup complete!" 
test-core: From c3814bfbfc631cb782f122d0b1812b24d9f811bc Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 10 Nov 2025 20:45:50 +0000 Subject: [PATCH 38/64] update to new webarena_verified dataset version --- .../webarena_verified/webarena_verified.json | 9928 ++++++++++------- 1 file changed, 5834 insertions(+), 4094 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json index 6e856fd2..e3c15594 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @@ -8,7 +8,6 @@ "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", "instantiation_dict": {"n": 1, "year": 2022}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -32,7 +31,6 @@ "intent_template": "Get the top-{{n}} best-selling brand name(s) in {{period}}", "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -56,7 +54,6 @@ "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -80,7 +77,6 @@ "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", "instantiation_dict": {"n": 2, "year": 2022}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -110,7 +106,6 @@ "intent_template": "Get the top-{{n}} best-selling product name(s) in {{period}}", "instantiation_dict": {"n": 3, "period": "Jan 2023"}, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -134,7 +129,6 @@ "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", "instantiation_dict": {"n": 1, "period": "Jan 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -158,7 +152,6 @@ "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", "instantiation_dict": {"n": 2, "year": 2023}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -189,7 +182,6 @@ "radius": "50 km" }, "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -235,7 +227,6 @@ "radius": "5 km" }, "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -262,7 +253,6 @@ "radius": "30 km" }, "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -308,7 +298,6 @@ "radius": "60 km" }, "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -351,11 +340,10 @@ "task_id": 11, "intent_template_id": 288, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the number of reviews that our store received so far that mention term \"disappointed\"", - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "intent": "Get the total number of reviews that our store received so far that mention term \"disappointed\"", + 
"intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "disappointed"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -375,11 +363,10 @@ "task_id": 12, "intent_template_id": 288, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the number of reviews that our store received so far that mention term \"satisfied\"", - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "intent": "Get the total number of reviews that our store received so far that mention term \"satisfied\"", + "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "satisfied"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -399,11 +386,10 @@ "task_id": 13, "intent_template_id": 288, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the number of reviews that our store received so far that mention term \"decent\"", - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "intent": "Get the total number of reviews that our store received so far that mention term \"decent\"", + "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "decent"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -423,11 +409,10 @@ "task_id": 14, "intent_template_id": 288, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the number of reviews that our store received so far that mention term \"not useful\"", - "intent_template": "Tell me the number of reviews that our store received so far that mention term 
\"{{term}}\"", + "intent": "Get the total number of reviews that our store received so far that mention term \"not useful\"", + "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "not useful"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -447,11 +432,10 @@ "task_id": 15, "intent_template_id": 288, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the number of reviews that our store received so far that mention term \"best\"", - "intent_template": "Tell me the number of reviews that our store received so far that mention term \"{{term}}\"", + "intent": "Get the total number of reviews that our store received so far that mention term \"best\"", + "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "best"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -475,7 +459,6 @@ "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", "instantiation_dict": {"start": "5000 Fifth Avenue, Pittsburgh", "end": "UPMC family health center"}, "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -512,7 +495,6 @@ "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", "instantiation_dict": {"start": "AMC Waterfront", "end": "Carnegie Mellon University"}, "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -549,7 +531,6 @@ "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", 
"instantiation_dict": {"start": "AMC Waterfront", "end": "Univ of Pittsburgh"}, "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -586,7 +567,6 @@ "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", "instantiation_dict": {"start": "Carnegie Science Center", "end": "Carnegie Mellon University"}, "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -623,7 +603,6 @@ "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", "instantiation_dict": {"start": "Randyland", "end": "Carnegie Mellon University"}, "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -658,11 +637,10 @@ "start_urls": [ "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html" ], - "intent": "Get name(s) of reviewer(s) who mention ear cups being small", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "intent": "Get name(s) of reviewer(s) who mention ear cups being small for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "ear cups being small"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -672,8 +650,10 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - "Joseph Brzezinski", "Catso", "Dibbins", 
"Anglebert Dinkherhump", - "Michelle Davis" + "Catso", + "Dibbins", + ["Anglebert Dinkherhump", "Anglebert", "Dinkherhump"], + ["Michelle Davis", "Michelle DavisMichelle Davis"] ] } } @@ -687,11 +667,10 @@ "start_urls": [ "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html" ], - "intent": "Get name(s) of reviewer(s) who mention under water photo", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "intent": "Get name(s) of reviewer(s) who mention under water photo for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "under water photo"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -712,11 +691,10 @@ "start_urls": [ "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html" ], - "intent": "Get name(s) of reviewer(s) who mention good fingerprint resistant", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "intent": "Get name(s) of reviewer(s) who mention good fingerprint resistant for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "good fingerprint resistant"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -738,11 +716,10 @@ "start_urls": [ "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html" ], - "intent": "Get name(s) of reviewer(s) who mention price being unfair", - "intent_template": 
"Get name(s) of reviewer(s) who mention {{description}}", + "intent": "Get name(s) of reviewer(s) who mention price being unfair for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "price being unfair"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -763,15 +740,13 @@ "start_urls": [ "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" ], - "intent": "Get name(s) of reviewer(s) who mention print quality and give rating of 3 or less stars", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} and give rating {{rating}}", - "instantiation_dict": {"description": "print quality", "rating": "3 or less stars"}, + "intent": "Get name(s) of reviewer(s) who mention print quality explicitly with a rating of 3 or less stars for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", + "instantiation_dict": {"description": "print quality explicitly with a rating of 3 or less stars"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", @@ -789,11 +764,10 @@ "start_urls": [ "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" ], - "intent": "Get name(s) of reviewer(s) who mention complain of the customer service", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}}", + "intent": 
"Get name(s) of reviewer(s) who mention complain of the customer service for the product on the current page", + "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "complain of the customer service"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -813,20 +787,36 @@ "task_id": 27, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.", - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", - "instantiation_dict": {"forum": "Showerthoughts"}, - "format_specification": null, - "start_url_context": null, + "intent": "In the Showerthoughts forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "instantiation_dict": {"forum": "personal finances"}, + "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "username" : { "type": "string" }, + "post_title": { "type": "string" }, + "count" : { "type": "number" } + }, + "required": ["username", 
"post_title", "count"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [0] + "retrieved_data": [ + { + "username": "Hammer94", + "post_title": "56 year old mom has no retirement. Where do I even start on her behalf?", + "count": 0 + } + ] } } ], @@ -837,20 +827,36 @@ "task_id": 28, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.", - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "intent": "In the Worcester forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", "instantiation_dict": {"forum": "Worcester"}, - "format_specification": null, - "start_url_context": null, + "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "username" : { "type": "string" }, + "post_title": { "type": "string" }, + "count" : { "type": "number" } + }, + "required": ["username", "post_title", "count"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [1] + "retrieved_data": [ + { + "username": "mineinhusdson", + "post_title": "Best place for a foot rub?", + "count": 0 + } 
+ ] } } ], @@ -861,20 +867,36 @@ "task_id": 29, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.", - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "intent": "In the DIY forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", "instantiation_dict": {"forum": "DIY"}, - "format_specification": null, - "start_url_context": null, + "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "username" : { "type": "string" }, + "post_title": { "type": "string" }, + "count" : { "type": "number" } + }, + "required": ["username", "post_title", "count"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [1] + "retrieved_data": [ + { + "username": "ziostraccette", + "post_title": "How can I bring an HDMI cable from my pc downstairs to my TV upstairs?", + "count": 0 + } + ] } } ], @@ -885,20 +907,36 @@ "task_id": 30, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.", - 
"intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "intent": "In the space forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", "instantiation_dict": {"forum": "space"}, - "format_specification": null, - "start_url_context": null, + "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "username" : { "type": "string" }, + "post_title": { "type": "string" }, + "count" : { "type": "number" } + }, + "required": ["username", "post_title", "count"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [0] + "retrieved_data": [ + { + "username": "Dhghomon", + "post_title": "Scientists erupt at NASA gutting funding for crucial Venus mission", + "count": 0 + } + ] } } ], @@ -909,20 +947,36 @@ "task_id": 31, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.", - "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.", + "intent": "In the photoshopbattles forum, get the username and post title of the most 
recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", "instantiation_dict": {"forum": "photoshopbattles"}, - "format_specification": null, - "start_url_context": null, + "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "username" : { "type": "string" }, + "post_title": { "type": "string" }, + "count" : { "type": "number" } + }, + "required": ["username", "post_title", "count"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [0] + "retrieved_data": [ + { + "username": "Proud_Idiot", + "post_title": "UK Prime Minister Rishi Sunak looking at a pothole", + "count": 0 + } + ] } } ], @@ -942,7 +996,6 @@ "target2": "the nearest supermarket own by a local company" }, "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -983,7 +1036,6 @@ "target2": "a supermarket" }, "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1024,7 +1076,6 @@ "target2": "a supermarket" }, "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ 
-1033,7 +1084,10 @@ "type": "array", "items": { "type": "object", - "properties": { "hotel": {"type": "string"}, "information": {"type": "string"} }, + "properties": { + "hotel" : { "type": "string" }, + "information": { "type": "string", "format": "duration" } + }, "required": ["hotel", "information"] } }, @@ -1065,7 +1119,6 @@ "target2": "a supermarket" }, "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1074,7 +1127,10 @@ "type": "array", "items": { "type": "object", - "properties": { "hotel": {"type": "string"}, "information": {"type": "string"} }, + "properties": { + "hotel" : { "type": "string" }, + "information": { "type": "string", "format": "duration" } + }, "required": ["hotel", "information"] } }, @@ -1104,7 +1160,6 @@ "location": "Carnegie Mellon University" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1128,7 +1183,6 @@ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", "instantiation_dict": {"place": "police station", "location": "gates building at CMU"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1152,7 +1206,6 @@ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", "instantiation_dict": {"place": "duquesne university", "location": "pittsburgh airport"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1176,7 +1229,6 @@ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", "instantiation_dict": {"place": "walmart", "location": "5600 fifth avenue"}, "format_specification": null, - "start_url_context": null, "eval": [ 
{ "evaluator": "AgentResponseEvaluator", @@ -1200,7 +1252,6 @@ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", "instantiation_dict": {"place": "amc theatre", "location": "hobart street"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1224,7 +1275,6 @@ "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "1"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1248,7 +1298,6 @@ "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "2"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1272,7 +1321,6 @@ "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "3"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1281,7 +1329,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["hollister", "Joust Bag", "Antonia Racer Tank"] + "retrieved_data": ["hollister", "Joust Bag", "nike"] } } ], @@ -1292,11 +1340,10 @@ "task_id": 44, "intent_template_id": 303, "start_urls": ["__GITLAB__"], - "intent": "Navigate to my todos", - "intent_template": "Navigate to my todos", + "intent": "Show me my todos", + "intent_template": "Show me my todos", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1306,12 +1353,9 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", - "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/dashboard/todos", - "response_status": 200, - "event_type": "navigation" + "url": 
["__GITLAB__/dashboard/todos", "__GITLAB__/dashboard/todos?state=pending"] } } ], @@ -1322,11 +1366,10 @@ "task_id": 45, "intent_template_id": 300, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Navigate to the most recent open issues", - "intent_template": "Navigate to the most recent open issues", + "intent": "Show me the most recent open issues for the current project", + "intent_template": "Show me the most recent open issues for the current project", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1335,17 +1378,7 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "url_match_mode": "exact", - "last_event_only": true, - "expected": { - "url": [ - "__GITLAB__/a11yproject/a11yproject.com/-/issues", - "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_date&state=opened" - ], - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__GITLAB__/a11yproject/a11yproject.com/-/issues"} } ], "revision": 2 @@ -1355,11 +1388,10 @@ "task_id": 46, "intent_template_id": 300, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Navigate to the most recent open issues", - "intent_template": "Navigate to the most recent open issues", + "intent": "Show me the most recent open issues for the current project", + "intent_template": "Show me the most recent open issues for the current project", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1369,15 +1401,13 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": [ "__GITLAB__/primer/design/-/issues", "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" ], - 
"response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -1388,11 +1418,10 @@ "task_id": 47, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past month, and the total amount of money I spent.", - "intent_template": "Today is 6/12/2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": {"period": "over the past month"}, + "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past months, and the total amount of money I spent (including shipping and handling fees).", + "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", + "instantiation_dict": {"period": "over the past months"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1422,11 +1451,10 @@ "task_id": 48, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past three days, and the total amount of money I spent.", - "intent_template": "Today is 6/12/2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past three days, and the total amount of money I spent (including shipping and handling fees).", + "intent_template": "Today is June 12, 2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", "instantiation_dict": {"period": "over the past three days"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1456,11 +1484,10 @@ "task_id": 49, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past four month, and the total amount of money I spent.", - "intent_template": "Today is 6/12/2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": {"period": "over the past four month"}, + "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past four months, and the total amount of money I spent (including shipping and handling fees).", + "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", + "instantiation_dict": {"period": "over the past four months"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1490,11 +1517,10 @@ "task_id": 50, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past year, and the total amount of money I spent.", - "intent_template": "Today is 6/12/2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", + "intent": "Today is June 12, 2023. 
Tell me how many complete orders I have over the past year, and the total amount of money I spent (including shipping and handling fees).", + "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", "instantiation_dict": {"period": "over the past year"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1524,11 +1550,10 @@ "task_id": 51, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is 6/12/2023. Tell me how many complete orders I have over the past six month, and the total amount of money I spent.", - "intent_template": "Today is 6/12/2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent.", - "instantiation_dict": {"period": "over the past six month"}, + "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past six months, and the total amount of money I spent (including shipping and handling fees).", + "intent_template": "Today is June 12, 2023. 
Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", + "instantiation_dict": {"period": "over the past six months"}, "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1562,12 +1587,11 @@ "intent_template": "How long does it take to walk from {{start}} to {{end}}?", "instantiation_dict": {"start": "Carnegie Mellon University", "end": "starbucks on Craig Street"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -1586,12 +1610,11 @@ "intent_template": "How long does it take to walk from {{start}} to {{end}}?", "instantiation_dict": {"start": "Univ of Pittsburgh", "end": "starbucks on Craig Street"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -1610,12 +1633,11 @@ "intent_template": "How long does it take to walk from {{start}} to {{end}}?", "instantiation_dict": {"start": "Carnegie Mellon University", "end": "Univ of Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, 
"expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -1634,12 +1656,11 @@ "intent_template": "How long does it take to walk from {{start}} to {{end}}?", "instantiation_dict": {"start": "the starbuck near CMU", "end": "Chatham university"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -1658,12 +1679,11 @@ "intent_template": "How long does it take to walk from {{start}} to {{end}}?", "instantiation_dict": {"start": "Carnegie Museum of Art", "end": "a library at CMU"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -1685,7 +1705,6 @@ "place2": "university center at Carnegie Mellon University" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1712,7 +1731,6 @@ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "cafe", "place2": "CMU Hunt library"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1736,7 +1754,6 @@ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Hunt library"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1760,7 +1777,6 @@ "intent_template": "Tell me the closest {{place1}}(s) to 
{{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Posner Hall"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1784,7 +1800,6 @@ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Sorrells Library"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1808,7 +1823,6 @@ "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", "instantiation_dict": {"quantifier": "most"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1832,7 +1846,6 @@ "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", "instantiation_dict": {"quantifier": "second most"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1841,7 +1854,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Adam Garcia", "Michael Nguyen", "Sarah Miller"] + "retrieved_data": ["Michael Nguyen", "Sarah Miller"] } } ], @@ -1852,11 +1865,10 @@ "task_id": 64, "intent_template_id": 276, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer name(s) who placed 2 orders in the entire history", - "intent_template": "Get customer name(s) who placed {{number}} orders in the entire history", + "intent": "Get customer name(s) who have 2 orders in any state in the entire history", + "intent_template": "Get customer name(s) who have {{number}} orders in any state in the entire history", "instantiation_dict": {"number": "2"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1865,7 +1877,7 @@ "expected": { "performed_operation": "retrieve", 
"status": "SUCCESS", - "retrieved_data": ["Lisa Kim", "Lisa Green", "Julia Williams", "Brian Smith", "Alexander Thomas"] + "retrieved_data": ["Emma Davis", "Veronica Costello"] } } ], @@ -1880,7 +1892,6 @@ "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", "instantiation_dict": {"quantifier": "fifth most"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1889,7 +1900,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Jason Miller", "Jennifer White"] + "retrieved_data": ["Adam Garcia", "Jason Miller", "Jennifer White"] } } ], @@ -1900,15 +1911,10 @@ "task_id": 66, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 post in \"books\" forum, Provide the URLs of posts that recommend a single book", - "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", - "instantiation_dict": { - "number": 10, - "subreddit": "books", - "description": "Provide the URLs of posts that recommend a single book" - }, + "intent": "Get the post titles of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", + "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", + "instantiation_dict": {"number": 10, "description": "the post titles", "forum": "books"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1918,8 +1924,8 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol", - "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter" + "I just finished reading The Hobbit to my 6 year old daughter, and she loved it!", + 
"Apple Books has a free audiobook of A Christmas Carol narrated by LeVar Burton!" ] } } @@ -1931,15 +1937,14 @@ "task_id": 67, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 post in \"books\" forum, Provide the book names from posts that recommend a single book", - "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "intent": "Get the book titles, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", + "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", "instantiation_dict": { "number": 10, - "subreddit": "books", - "description": "Provide the book names from posts that recommend a single book" + "description": "the book titles, exactly as they appear in the post description,", + "forum": "Books" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1948,7 +1953,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["A Christmas Carol", "The Hobbit"] + "retrieved_data": ["The Hobbit", "A Christmas Carol"] } } ], @@ -1959,15 +1964,14 @@ "task_id": 68, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 post in \"books\" forum, Provide the author names and the book names from posts that recommend a single book", - "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "intent": "Get the author names and book titles, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", + "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", "instantiation_dict": { "number": 10, - "subreddit": 
"books", - "description": "Provide the author names and the book names from posts that recommend a single book" + "description": "the author names and book titles, exactly as they appear in the post description,", + "forum": "Books" }, - "format_specification": "For each pair, return a dictionary with the key 'book' for the book name and 'author' for the author name.", - "start_url_context": null, + "format_specification": "For each pair, return a dictionary with the key 'book' for the book title and 'author' for the author name. Use null for any missing value.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -1984,8 +1988,8 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "book": "The Little Prince", "author": "Levar Burton" }, - { "book": "The Hobbit" , "author": "Tolkien" } + { "book": "The Hobbit" , "author": "Tolkien" }, + { "book": "A Christmas Carol", "author": null } ] } } @@ -1997,15 +2001,14 @@ "task_id": 69, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the names of the organizations involved", - "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}", + "intent": "Get the URLs of any organizations involved, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that talk about supporting local book stores.", + "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that talk about supporting local book stores.", "instantiation_dict": { "number": 10, - "subreddit": "books", - "description": "is there any post talks about supporting local book stores? 
If so, tell me the names of the organizations involved" + "description": "the URLs of any organizations involved, exactly as they appear in the post description,", + "forum": "Books" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2014,7 +2017,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["bookshop.org"] + "retrieved_data": [ ["bookshop.org", "https://bookshop.org"] ] } } ], @@ -2029,7 +2032,6 @@ "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Carnegie Mellon University"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2053,7 +2055,6 @@ "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Chatham University"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2077,7 +2078,6 @@ "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Yale University"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2101,7 +2101,6 @@ "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Columbia University"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2130,7 +2129,6 @@ ] }, "format_specification": "Return the list of place in order using their names.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2162,7 +2160,6 @@ ] }, "format_specification": "Return the list of place in order using their names.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2191,7 +2188,6 @@ "place_list": ["Princeton University", "Yale University", "Harvard University"] }, "format_specification": "Return the list 
of place in order using their names.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2211,11 +2207,10 @@ "task_id": 77, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What is the total count of Pending reviews amongst all the reviews?", - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Pending reviews amongst all the reviews.", + "intent_template": "Get the total number of {{status}} reviews amongst all the reviews.", "instantiation_dict": {"status": "Pending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2235,11 +2230,10 @@ "task_id": 78, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What is the total count of Approved reviews amongst all the reviews?", - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Approved reviews amongst all the reviews.", + "intent_template": "Get the total number of {{status}} reviews amongst all the reviews.", "instantiation_dict": {"status": "Approved"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2259,11 +2253,10 @@ "task_id": 79, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What is the total count of Not Approved reviews amongst all the reviews?", - "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Not Approved reviews amongst all the reviews.", + "intent_template": "Get the total number of {{status}} reviews amongst all the reviews.", "instantiation_dict": {"status": "Not Approved"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2291,12 +2284,11 @@ "place_C": 
"Pittsburgh International Airport" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2319,12 +2311,11 @@ "place_C": "Pittsburgh International Airport" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2347,12 +2338,11 @@ "place_C": "Boston Logan International Airport" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2375,12 +2365,11 @@ "place_C": "starbucks on craig street" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2399,12 +2388,11 @@ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", "instantiation_dict": {"hotel": "DoubleTree by Hilton New York Downtown", "place": "Keens Steakhouse"}, "format_specification": null, - "start_url_context": null, "eval": [ { 
"evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2426,12 +2414,11 @@ "place": "Carnegie Mellon University" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2450,12 +2437,11 @@ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", "instantiation_dict": {"hotel": "La Quinta Inn near the airport", "place": "Upitt"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2474,12 +2460,11 @@ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", "instantiation_dict": {"hotel": "red roof inn", "place": "Pittsburgh science museum"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2498,12 +2483,11 @@ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", 
"instantiation_dict": {"hotel": "Homewood Suites Southpointe", "place": "PPG Paints Arena"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -2522,7 +2506,6 @@ "intent_template": "Which US states border {{state}}?", "instantiation_dict": {"state": "Connecticut"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2546,7 +2529,6 @@ "intent_template": "Which US states border {{state}}?", "instantiation_dict": {"state": "Pennsylvania"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2570,7 +2552,6 @@ "intent_template": "Which US states border {{state}}?", "instantiation_dict": {"state": "Massachusetts"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2594,7 +2575,6 @@ "intent_template": "Which US states border {{state}}?", "instantiation_dict": {"state": "Vermont"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2618,7 +2598,6 @@ "intent_template": "Which US states border {{state}}?", "instantiation_dict": {"state": "New Hampshire"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2642,16 +2621,15 @@ "intent_template": "Tell me the grand total of invoice {{id}}.", "instantiation_dict": {"id": "000000001"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": 
"array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["36.39"] + "retrieved_data": [36.39] } } ], @@ -2666,16 +2644,15 @@ "intent_template": "Tell me the grand total of invoice {{id}}.", "instantiation_dict": {"id": "000000002"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["39.64"] + "retrieved_data": [39.64] } } ], @@ -2690,7 +2667,6 @@ "intent_template": "Tell me the status of my latest order and when will it arrive", "instantiation_dict": {}, "format_specification": "Use \"status\" for status and \"arrival_date\" for arrival date (null if not applicable).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2720,11 +2696,10 @@ "task_id": 97, "intent_template_id": 120, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", - "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts", + "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts (use the provided wiki site to look up any needed information)", + "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts (use the provided wiki site to look up any needed information)", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2748,7 
+2723,6 @@ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", "instantiation_dict": {"places": "tea cafe", "start": "University of Pittsburgh"}, "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2787,7 +2761,6 @@ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", "instantiation_dict": {"places": "Five Guys", "start": "5700 Penn Ave"}, "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2826,7 +2799,6 @@ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", "instantiation_dict": {"places": "Starbucks", "start": "Carnegie Mellon"}, "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2865,7 +2837,6 @@ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", "instantiation_dict": {"places": "In-N-Out", "start": "Upitts"}, "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2885,10 +2856,9 @@ "intent_template_id": 349, "start_urls": ["__GITLAB__"], "intent": "Display the list of open issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", - "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "help 
wanted", "repo": "a11yproject/a11yproject.com"}, + "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "help wanted", "repo": "a11yproject/a11yproject.com", "state": "open"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2898,12 +2868,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?label_name%5B%5D=help%20wanted", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?state=opened&label_name%5B%5D=help wanted" + } } } ], @@ -2914,11 +2886,10 @@ "task_id": 103, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions", - "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "question", "repo": "kkroening/ffmpeg-python"}, + "intent": "Display the list of closed issues in the kkroening/ffmpeg-python repository that have labels related to questions", + "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "question", "repo": "kkroening/ffmpeg-python", "state": "closed"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2928,12 +2899,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + 
"ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?state=closed&label_name%5B%5D=question" + } } } ], @@ -2944,11 +2917,10 @@ "task_id": 104, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test", - "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "flaky-test", "repo": "keycloak/keycloak"}, + "intent": "Display the list of all issues in the keycloak/keycloak repository that have labels related to flaky-test", + "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": {"label": "flaky-test", "repo": "keycloak/keycloak", "state": "all"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2958,12 +2930,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test&state=opened", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/keycloak/keycloak/-/issues/?state=all&label_name%5B%5D=flaky-test" + } } } ], @@ -2974,11 +2948,14 @@ "task_id": 105, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI 
Generator CLI", - "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "OpenAPI Generator CLI", "repo": "OpenAPITools/openapi-generator"}, + "intent": "Display the list of not yet closed issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", + "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "OpenAPI Generator CLI", + "repo": "OpenAPITools/openapi-generator", + "state": "not yet closed" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -2988,12 +2965,17 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI&state=opened", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": [ + "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?state=opened&label_name%5B%5D=OpenAPI%20Generator%20CLI", + "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI" + ] + } } } ], @@ -3004,11 +2986,14 @@ "task_id": 106, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG", - "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "BUG", "repo": "umano/AndroidSlidingUpPanel"}, + "intent": "Display the list of open issues in the umano/AndroidSlidingUpPanel repository that have labels related to all 
except BUG", + "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "instantiation_dict": { + "label": "all except BUG", + "repo": "umano/AndroidSlidingUpPanel", + "state": "open" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3018,12 +3003,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG&state=opened", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?state=opened&not%5Blabel_name%5D%5B%5D=BUG" + } } } ], @@ -3034,11 +3021,10 @@ "task_id": 107, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of successful orders from May to December 2022", - "intent_template": "Get the monthly count of successful orders {{period}}", - "instantiation_dict": {"period": "from May to December 2022"}, + "intent": "Get the monthly count of completed orders from May 2022 through December 2022, inclusive", + "intent_template": "Get the monthly count of completed orders {{period}}", + "instantiation_dict": {"period": "from May 2022 through December 2022, inclusive"}, "format_specification": "Return a list of objects, where each object includes a \"month\" field for the month and a \"count\" field for the count.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3055,14 +3041,14 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "month": "May" , "count": 8 }, - { "month": "June" , "count": 13 }, - { "month": "July" , "count": 9 }, - { "month": "August" , "count": 8 }, - { "month": 
"Sepetember", "count": 10 }, - { "month": "October" , "count": 4 }, - { "month": "November" , "count": 5 }, - { "month": "December" , "count": 10 } + { "month": "May" , "count": 8 }, + { "month": "June" , "count": 13 }, + { "month": "July" , "count": 9 }, + { "month": "August" , "count": 8 }, + { "month": "September", "count": 10 }, + { "month": "October" , "count": 4 }, + { "month": "November" , "count": 5 }, + { "month": "December" , "count": 10 } ] } } @@ -3074,11 +3060,10 @@ "task_id": 108, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of successful orders 01/2023-05/2023", - "intent_template": "Get the monthly count of successful orders {{period}}", - "instantiation_dict": {"period": "01/2023-05/2023"}, + "intent": "Get the monthly count of completed orders January 2023 through May 2023", + "intent_template": "Get the monthly count of completed orders {{period}}", + "instantiation_dict": {"period": "January 2023 through May 2023"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3095,7 +3080,7 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "month": "January" , "count": 12 }, + { "month": "January" , "count": 10 }, { "month": "February", "count": 7 }, { "month": "March" , "count": 5 }, { "month": "April" , "count": 9 }, @@ -3111,11 +3096,10 @@ "task_id": 109, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of successful orders from Jan to December 2022", - "intent_template": "Get the monthly count of successful orders {{period}}", + "intent": "Get the monthly count of completed orders from Jan to December 2022", + "intent_template": "Get the monthly count of completed orders {{period}}", "instantiation_dict": {"period": "from Jan to December 2022"}, "format_specification": "Use \"month\" for the month 
and \"count\" for the count.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3155,11 +3139,10 @@ "task_id": 110, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of successful orders from Jan to Nov 2022", - "intent_template": "Get the monthly count of successful orders {{period}}", - "instantiation_dict": {"period": "from Jan to Nov 2022"}, + "intent": "Get the monthly count of completed orders from Jan 2022 through Nov 2022", + "intent_template": "Get the monthly count of completed orders {{period}}", + "instantiation_dict": {"period": "from Jan 2022 through Nov 2022"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3198,11 +3181,10 @@ "task_id": 111, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of successful orders from Feb to Nov 2022", - "intent_template": "Get the monthly count of successful orders {{period}}", - "instantiation_dict": {"period": "from Feb to Nov 2022"}, + "intent": "Get the monthly count of completed orders from Feb 2022 through Nov 2022", + "intent_template": "Get the monthly count of completed orders {{period}}", + "instantiation_dict": {"period": "from Feb 2022 through Nov 2022"}, "format_specification": "Use \"month\" for the month and \"count\" for the count.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3240,11 +3222,10 @@ "task_id": 112, "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Circe fleece", - "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Circe fleece", + "intent_template": "Return the 
customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Circe fleece"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3264,11 +3245,10 @@ "task_id": 113, "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Olivia zip jacket", - "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Olivia zip jacket", + "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Olivia zip jacket"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3277,7 +3257,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Emma Lopez", "Seam Miller"] + "retrieved_data": ["Emma", "Seam Miller"] } } ], @@ -3288,11 +3268,10 @@ "task_id": 114, "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer name(s) who gave a rating of 3 stars or below for Antonia racer tank", - "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}}", + "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Antonia racer tank", + "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Antonia racer tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3312,11 +3291,10 @@ "task_id": 115, "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer name(s) who gave a rating of 3 
stars or below for {{product}} 3 stars or below for Chloe tank", - "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}} 3 stars or below for {{product}}", + "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Chloe tank", + "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Chloe tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3335,11 +3313,10 @@ "task_id": 116, "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer name(s) who gave a rating of 3 stars or below for tanks products", - "intent_template": "Return the customer name(s) who gave a rating of 3 stars or below for {{product}} 3 stars or below for {{product}}", + "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for tanks products", + "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "tanks products"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3349,8 +3326,8 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - "Alexander", "Carma", "Dominic", "Merrie", "Monroe", "Scotty", "Shaunte", - "Teofila", "Valorie", "Yan", "Trey", "Edmund", "Tracey", "Mikkel", "Matthew", + "Dominic", "Trey", "Edmund", "Merrie", "Shaunte", "Teofila", "Carma", "Yan", + "Valorie", "Mikkel", "Matthew", "Monroe", "Scotty", "Alexander", "Tracey", "Joey" ] } @@ -3367,16 +3344,15 @@ "intent_template": "Get the date when I made my first purchase on this site", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": 
false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["3/2/22"] + "retrieved_data": ["March 2, 2022"] } } ], @@ -3391,7 +3367,6 @@ "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3406,11 +3381,10 @@ "task_id": 119, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 4 stars or above for Antonia Racer Tank.", - "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "intent": "Get title and rating for all reviews with 4 stars or above for Antonia Racer Tank.", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", "instantiation_dict": {"product": "Antonia Racer Tank"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3437,11 +3411,10 @@ "task_id": 120, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 4 stars or above for Ana Running Short.", - "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "intent": "Get title and rating for all reviews with 4 stars or above for Ana Running Short.", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", "instantiation_dict": {"product": "Ana Running Short"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3471,11 +3444,10 
@@ "task_id": 121, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 4 stars or above for Circe hooded fleece.", - "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "intent": "Get title and rating for all reviews with 4 stars or above for Circe hooded fleece.", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", "instantiation_dict": {"product": "Circe hooded fleece"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3502,11 +3474,10 @@ "task_id": 122, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 4 stars or above for Olivia zip jacket.", - "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "intent": "Get title and rating for all reviews with 4 stars or above for Olivia zip jacket.", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", "instantiation_dict": {"product": "Olivia zip jacket"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3533,11 +3504,10 @@ "task_id": 123, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 4 stars or above for Circe's products.", - "intent_template": "List all reviews with 4 stars or above for {{product}}.", + "intent": "Get title and rating for all reviews with 4 stars or above for Circe's products.", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", "instantiation_dict": {"product": "Circe's products"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of 
stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3568,7 +3538,6 @@ "intent_template": "What is the price range of {{product}} in the One Stop Market?", "instantiation_dict": {"product": "wireless earphone"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3577,14 +3546,17 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"min": 0.01, "max": 745.0} ] + "retrieved_data": [ {"min": 0.01, "max": 298.0} ] } } ], @@ -3599,7 +3571,6 @@ "intent_template": "What is the price range of {{product}} in the One Stop Market?", "instantiation_dict": {"product": "teeth grinding mouth guard"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3608,7 +3579,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -3630,7 +3604,6 @@ "intent_template": "What is the price range of {{product}} in the One Stop Market?", "instantiation_dict": {"product": "Canon photo printer"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3639,7 +3612,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": 
{"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -3657,11 +3633,10 @@ "task_id": 127, "intent_template_id": 1001, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What brands appear most frequently among the top search terms?", - "intent_template": "What brands appear most frequently among the top search terms?", + "intent": "Get the most frequent search terms that match available products in the store", + "intent_template": "Get the most frequent search terms that match available products in the store", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3685,7 +3660,6 @@ "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "2"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3709,7 +3683,6 @@ "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "4"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3733,7 +3706,6 @@ "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "5"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3757,7 +3729,6 @@ "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "7"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -3777,15 +3748,14 @@ "task_id": 132, "intent_template_id": 322, "start_urls": ["__GITLAB__"], - "intent": "How many commits did kilian make 
to a11yproject.com on 3/5/2023?", + "intent": "How many commits did kilian make to a11yproject.com on March 5, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "3/5/2023"}, + "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "March 5, 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -3801,15 +3771,14 @@ "task_id": 133, "intent_template_id": 322, "start_urls": ["__GITLAB__"], - "intent": "How many commits did Eric make to a11yproject.com on 3/2?", + "intent": "How many commits did Eric make to a11yproject.com on March 2, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "Eric", "repo": "a11yproject.com", "date": "3/2/2023"}, + "instantiation_dict": {"user": "Eric", "repo": "a11yproject.com", "date": "March 2, 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -3825,15 +3794,14 @@ "task_id": 134, "intent_template_id": 322, "start_urls": ["__GITLAB__"], - "intent": "How many commits did kilian make to a11yproject.com on 3/1/2023?", + "intent": "How many commits did kilian make to a11yproject.com on March 1, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "3/1/2023"}, + "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "March 1, 2023"}, "format_specification": 
null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -3849,15 +3817,18 @@ "task_id": 135, "intent_template_id": 322, "start_urls": ["__GITLAB__"], - "intent": "How many commits did Eric and Kilian make to a11yproject.com on 1/3/2023?", + "intent": "How many commits did Eric and Kilian make to a11yproject.com on January 3, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "Eric and Kilian", "repo": "a11yproject.com", "date": "1/3/2023"}, + "instantiation_dict": { + "user": "Eric and Kilian", + "repo": "a11yproject.com", + "date": "January 3, 2023" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -3873,15 +3844,18 @@ "task_id": 136, "intent_template_id": 322, "start_urls": ["__GITLAB__"], - "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?", + "intent": "How many commits did Steven Woodson make to a11y-webring.club on February 6, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "Steven Woodson", "repo": "a11y-webring.club", "date": "2/6/2023"}, + "instantiation_dict": { + "user": "Steven Woodson", + "repo": "a11y-webring.club", + "date": "February 6, 2023" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -3904,12 +3878,11 @@ "city2": "the home city 
of Pirates" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -3931,12 +3904,11 @@ "city2": "the city with the most authentic Philly cheesesteaks" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -3955,12 +3927,11 @@ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", "instantiation_dict": {"city1": "the hometown of Joe Biden", "city2": "Bridgeport"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -3979,12 +3950,11 @@ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", "instantiation_dict": {"city1": "the city of Niagara Falls", "city2": "the city of Yale University"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4003,16 +3973,15 @@ 
"intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", "instantiation_dict": {"category": "food-related", "time": "March 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["32.41"] + "retrieved_data": [32.41] } } ], @@ -4027,16 +3996,15 @@ "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", "instantiation_dict": {"category": "hair care and hair style", "time": "Jan 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["68.51"] + "retrieved_data": [68.51] } } ], @@ -4047,20 +4015,19 @@ "task_id": 143, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on home decoration shopping during 1/29/2023 without considering shipping and handling fee", + "intent": "Return how much I spent on home decoration shopping during January 29, 2023 without considering shipping and handling fee", "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", "instantiation_dict": {"category": "home decoration", "time": "1/29/2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": 
false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["260.69"] + "retrieved_data": [260.69] } } ], @@ -4071,11 +4038,10 @@ "task_id": 144, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on food shopping from mid Jan to the end of Jan 2023 without considering shipping and handling fee", + "intent": "Return how much I spent on food shopping from January 15 to January 31 2023 without considering shipping and handling fee", "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "food", "time": "from mid Jan to the end Jan 2023"}, + "instantiation_dict": {"category": "food", "time": "from January 15 to the end Jan 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4099,16 +4065,15 @@ "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", "instantiation_dict": {"category": "cooking and food", "time": "March 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["42.35"] + "retrieved_data": [42.35] } } ], @@ -4119,20 +4084,26 @@ "task_id": 146, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought Sep 2022", + "intent": "Get the size of the picture frame I bought in Sep 
2022", "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "size", "product": "picture frame", "time": "Sep 2022"}, - "format_specification": null, - "start_url_context": null, + "instantiation_dict": {"option": "size", "product": "picture frame", "time": "in Sep 2022"}, + "format_specification": "Use \"width\" for width in inches and \"height\" for height in inches.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "width": {"type": "number"}, "height": {"type": "number"} }, + "required": ["width", "height"] + } + }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["16\"x24\""] + "retrieved_data": [ {"width": 16, "height": 24} ] } } ], @@ -4143,24 +4114,26 @@ "task_id": 147, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought between June and December 2022", + "intent": "Get the size of the picture frame I bought in 2022", "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": { - "option": "size", - "product": "picture frame", - "time": "between June and December 2022" - }, - "format_specification": null, - "start_url_context": null, + "instantiation_dict": {"option": "size", "product": "picture frame", "time": "in 2022"}, + "format_specification": "Use \"width\" for width in inches and \"height\" for height in inches.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "width": {"type": "number"}, "height": {"type": "number"} }, + "required": ["width", "height"] + } + }, "expected": { 
"performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["16\"x24\""] + "retrieved_data": [ {"width": 16, "height": 24} ] } } ], @@ -4175,7 +4148,6 @@ "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", "instantiation_dict": {"option": "color", "product": "picture frame", "time": "Sep 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4184,7 +4156,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Mist"] + "retrieved_data": ["Mist 16*24"] } } ], @@ -4199,7 +4171,6 @@ "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", "instantiation_dict": {"option": "color", "product": "artifical plants", "time": "Feb 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4223,16 +4194,15 @@ "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", "instantiation_dict": {"option": "price", "product": "fake tree", "time": "Jan 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["260.69"] + "retrieved_data": [260.69] } } ], @@ -4247,12 +4217,11 @@ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", "instantiation_dict": {"location1": "CMU", "location2": "University of Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": 
"string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4271,12 +4240,11 @@ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", "instantiation_dict": {"location1": "Schenley park", "location2": "Upitt"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4295,12 +4263,11 @@ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", "instantiation_dict": {"location1": "REI", "location2": "CMU"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4319,12 +4286,11 @@ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", "instantiation_dict": {"location1": "CMU gates building", "location2": "Schenley park"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4346,12 +4312,11 @@ "location2": "Schenley park" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": 
"array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -4366,11 +4331,10 @@ "task_id": 156, "intent_template_id": 290, "start_urls": ["__GITLAB__"], - "intent": "Navigate to the merge requests assigned to me", - "intent_template": "Navigate to the merge requests assigned to me", + "intent": "Show me the merge requests assigned to me", + "intent_template": "Show me the merge requests assigned to me", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4379,13 +4343,9 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/dashboard/merge_requests", + "query_params": { "assignee_username": ["byteblaze"] } } } ], @@ -4400,7 +4360,6 @@ "intent_template": "Show all customers", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4410,13 +4369,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING_ADMIN__/customer/index/", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING_ADMIN__/customer/index/"} } ], "revision": 2 @@ -4430,7 +4384,6 @@ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", "instantiation_dict": {"num": 11}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4440,12 +4393,10 @@ { "evaluator": "NetworkEventEvaluator", 
"site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -4460,7 +4411,6 @@ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", "instantiation_dict": {"num": 31}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4470,12 +4420,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -4490,7 +4438,6 @@ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", "instantiation_dict": {"num": 6}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4500,12 +4447,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -4520,7 +4465,6 @@ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", "instantiation_dict": {"num": 
23}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4530,12 +4474,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -4550,7 +4492,6 @@ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", "instantiation_dict": {"num": 40}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4560,12 +4501,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -4578,11 +4517,10 @@ "start_urls": [ "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html" ], - "intent": "List all review titles with 2 stars or below for this product.", - "intent_template": "List all review titles with 2 stars or below for this product.", + "intent": "Get all review titles with 2 stars or below for the product on the current page.", + "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4602,11 +4540,10 @@ "task_id": 164, 
"intent_template_id": 136, "start_urls": ["__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html"], - "intent": "List all review titles with 2 stars or below for this product.", - "intent_template": "List all review titles with 2 stars or below for this product.", + "intent": "Get all review titles with 2 stars or below for the product on the current page.", + "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4626,11 +4563,10 @@ "task_id": 165, "intent_template_id": 136, "start_urls": ["__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html"], - "intent": "List all review titles with 2 stars or below for this product.", - "intent_template": "List all review titles with 2 stars or below for this product.", + "intent": "Get all review titles with 2 stars or below for the product on the current page.", + "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4652,11 +4588,10 @@ "start_urls": [ "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html" ], - "intent": "List all review titles with 2 stars or below for this product.", - "intent_template": "List all review titles with 2 stars or below for this product.", + "intent": "Get all review titles with 2 stars or below for the product on the current page.", + "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the 
context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4675,11 +4610,10 @@ "task_id": 167, "intent_template_id": 136, "start_urls": ["__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html"], - "intent": "List all review titles with 2 stars or below for this product.", - "intent_template": "List all review titles with 2 stars or below for this product.", + "intent": "Get all review titles with 2 stars or below for the product on the current page.", + "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4699,11 +4633,10 @@ "task_id": 168, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got more than 100 stars?", - "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", - "instantiation_dict": {"description": "more than 100"}, + "intent": "Get the project ID(s) of my personal project(s) that received more than 100 stars", + "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", + "instantiation_dict": {"description": "more than 100 stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4722,11 +4655,10 @@ "task_id": 169, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got the most stars?", - "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", - "instantiation_dict": {"description": "the most"}, + "intent": "Get the 
project ID(s) of my personal project(s) that received the most stars", + "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", + "instantiation_dict": {"description": "the most stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4735,7 +4667,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [174, 180] + "retrieved_data": [187, 183] } } ], @@ -4746,11 +4678,10 @@ "task_id": 170, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got the least stars?", - "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", - "instantiation_dict": {"description": "the least"}, + "intent": "Get the project ID(s) of my personal project(s) that received the least stars", + "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", + "instantiation_dict": {"description": "the least stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4759,7 +4690,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [189, 193, 190, 188, 184, 181] + "retrieved_data": [193, 190, 189, 188, 184, 181] } } ], @@ -4770,11 +4701,10 @@ "task_id": 171, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got less than 5 stars?", - "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", - "instantiation_dict": {"description": "less than 5"}, + "intent": "Get the project ID(s) of my personal project(s) that received less than 5 stars", + "intent_template": "Get the 
project ID(s) of my personal project(s) that received {{description}}", + "instantiation_dict": {"description": "less than 5 stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4783,7 +4713,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [186, 179, 185, 182, 189, 193, 190, 188, 184, 181] + "retrieved_data": [182, 179, 186, 185, 193, 190, 189, 188, 184, 181] } } ], @@ -4794,11 +4724,10 @@ "task_id": 172, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of the repository(ies) where I made contributions that got no stars?", - "intent_template": "Tell me the project ID(s) of the repository(ies) where I made contributions that got {{description}} stars?", - "instantiation_dict": {"description": "no"}, + "intent": "Tell me the project ID(s) of my personal project(s) that received no stars?", + "intent_template": "Tell me the project ID(s) of my personal project(s) that received {{description}}?", + "instantiation_dict": {"description": "no stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4807,7 +4736,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [189, 193, 190, 188, 184, 181] + "retrieved_data": [193, 190, 189, 188, 184, 181] } } ], @@ -4822,11 +4751,10 @@ "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", "instantiation_dict": {"keyword": "better"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, "expected": { "performed_operation": "retrieve", @@ -4846,11 +4774,10 @@ 
"intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", "instantiation_dict": {"keyword": "feature"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, "expected": { "performed_operation": "retrieve", @@ -4870,11 +4797,10 @@ "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", "instantiation_dict": {"keyword": "dependency"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, "expected": { "performed_operation": "retrieve", @@ -4894,11 +4820,10 @@ "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", "instantiation_dict": {"keyword": "theme editor"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, "expected": { "performed_operation": "retrieve", @@ -4918,11 +4843,10 @@ "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", "instantiation_dict": {"keyword": "homepage content"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, 
"results_schema": { "type": "array", "items": {"type": "boolean"} }, "expected": { "performed_operation": "retrieve", @@ -4942,7 +4866,6 @@ "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", "instantiation_dict": {"keyword": "better"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4965,7 +4888,6 @@ "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", "instantiation_dict": {"keyword": "feature"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -4988,7 +4910,6 @@ "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", "instantiation_dict": {"keyword": "dependency"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5011,7 +4932,6 @@ "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", "instantiation_dict": {"keyword": "theme editor"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5034,7 +4954,6 @@ "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", "instantiation_dict": {"keyword": "homepage content"}, "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5053,11 +4972,10 @@ "task_id": 183, 
"intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the SKU of the simple products that have 10 units left", - "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "intent": "Give me the SKU of the products that have 10 units left", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", "instantiation_dict": {"Attribute": "SKU", "N": "10"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5076,11 +4994,10 @@ "task_id": 184, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the name of the simple products that have 0 units left", - "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "intent": "Give me the name of the products that have 0 units left", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", "instantiation_dict": {"Attribute": "name", "N": "0"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5100,11 +5017,10 @@ "task_id": 185, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the brand of the simple products that have 3 units left", - "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", - "instantiation_dict": {"Attribute": "brand", "N": "3"}, + "intent": "Give me the material of the products that have 3 units left", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", + "instantiation_dict": {"Attribute": "material", "N": "3"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5113,7 +5029,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Eos", "Minera"] + 
"retrieved_data": ["Cotton", "Fleece"] } } ], @@ -5124,11 +5040,10 @@ "task_id": 186, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the product names and the sizes of the simple products that have 2-3 units left", - "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "intent": "Give me the product names and the sizes of the products that have 2-3 units left", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", "instantiation_dict": {"Attribute": "product names and the sizes", "N": "2-3"}, "format_specification": "Use \"name\" for the product name and \"size\" for the size.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5158,11 +5073,10 @@ "task_id": 187, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the SKU of the simple products that have 1-3 units left", - "intent_template": "Give me the {{Attribute}} of the simple products that have {{N}} units left", + "intent": "Give me the SKU of the products that have 1-3 units left", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", "instantiation_dict": {"Attribute": "SKU", "N": "1-3"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5182,20 +5096,19 @@ "task_id": 188, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest cancelled order", - "intent_template": "Get the total cost of my latest {{status}} order", - "instantiation_dict": {"status": "cancelled"}, + "intent": "Get the total cost of my latest order marked as \"cancelled\"", + "intent_template": "Get the total cost of my latest order {{status}}", + "instantiation_dict": {"status": "marked as 'cancelled'"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["365.42"] + "retrieved_data": [365.42] } } ], @@ -5206,20 +5119,19 @@ "task_id": 189, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest pending order", - "intent_template": "Get the total cost of my latest {{status}} order", - "instantiation_dict": {"status": "pending"}, + "intent": "Get the total cost of my latest order marked as \"pending\"", + "intent_template": "Get the total cost of my latest order {{status}}", + "instantiation_dict": {"status": "marked as 'pending'"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["754.99"] + "retrieved_data": [754.99] } } ], @@ -5230,20 +5142,19 @@ "task_id": 190, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest complete order", - "intent_template": "Get the total cost of my latest {{status}} order", - "instantiation_dict": {"status": "complete"}, + "intent": "Get the total cost of my latest order marked as \"complete\"", + "intent_template": "Get the total cost of my latest order {{status}}", + "instantiation_dict": {"status": "marked as 'complete'"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": 
{"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["65.32"] + "retrieved_data": [65.32] } } ], @@ -5254,11 +5165,10 @@ "task_id": 191, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest processing order", - "intent_template": "Get the total cost of my latest {{status}} order", - "instantiation_dict": {"status": "processing"}, + "intent": "Get the total cost of my latest order marked as \"processing\"", + "intent_template": "Get the total cost of my latest order {{status}}", + "instantiation_dict": {"status": "marked as 'processing'"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5277,20 +5187,19 @@ "task_id": 192, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest non-cancelled order", - "intent_template": "Get the total cost of my latest {{status}} order", - "instantiation_dict": {"status": "non-cancelled"}, + "intent": "Get the total cost of my latest order that is not cancelled", + "intent_template": "Get the total cost of my latest order {{status}}", + "instantiation_dict": {"status": "that is not cancelled"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["754.99"] + "retrieved_data": [754.99] } } ], @@ -5305,16 +5214,15 @@ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", "instantiation_dict": {"status": "completed", "N": "2"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", 
"ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["182.4"] + "retrieved_data": [182.4] } } ], @@ -5329,16 +5237,15 @@ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", "instantiation_dict": {"status": "completed", "N": "5"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["555.2"] + "retrieved_data": [555.2] } } ], @@ -5353,16 +5260,15 @@ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", "instantiation_dict": {"status": "pending", "N": "5"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["885.4"] + "retrieved_data": [885.4] } } ], @@ -5377,16 +5283,15 @@ "intent_template": "Get the payment difference between the last {{N}} {{status_1}} orders and the last {{N}} {{status_2}} orders", "instantiation_dict": {"status_1": "cancelled", "status_2": "completed", "N": "4"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": 
"number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["194.25"] + "retrieved_data": [194.25] } } ], @@ -5401,16 +5306,15 @@ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", "instantiation_dict": {"status": "non-cancelled", "N": "5"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["778.2"] + "retrieved_data": [778.2] } } ], @@ -5425,7 +5329,6 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": {"attribute": "customer name", "status": "most recent cancelled"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5449,7 +5352,6 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": {"attribute": "order ID", "status": "newest pending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5473,7 +5375,6 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": {"attribute": "billing name", "status": "oldest complete"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5497,7 +5398,6 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": {"attribute": "customer name", "status": "earliest fraud suspect"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5520,12 +5420,11 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", 
"instantiation_dict": {"attribute": "date", "status": "most recent cancelled"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -5544,7 +5443,6 @@ "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": {"attribute": "purchase date and order id", "status": "most recent pending"}, "format_specification": "Use \"date\" for the date and \"order_id\" for the order id.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5571,18 +5469,17 @@ "task_id": 204, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the product name and discounted price (low to high) of the most recent completed order", + "intent": "Get the product name and final price (low to high) of the most recent completed order", "intent_template": "Get the {{attribute}} of the {{status}} order", "instantiation_dict": { - "attribute": "product name and discounted price (low to high)", + "attribute": "product name and final price (low to high)", "status": "most recent completed" }, - "format_specification": "Use \"name\" for the product name and \"price\" for the discounted price.", - "start_url_context": null, + "format_specification": "Use \"name\" for the product name and \"price\" for the final price.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "true", + "ordered": true, "results_schema": { "type": "array", "items": { @@ -5598,8 +5495,8 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "name": "Proteus Fitness Jackshirt" , "price": 45 }, - { "name": "Ida Workout Parachute Pant", "price": 48 } + { "name": "Proteus Fitness Jackshirt" , "price": 45.0 
}, + { "name": "Ida Workout Parachute Pant", "price": 38.4 } ] } } @@ -5611,15 +5508,14 @@ "task_id": 205, "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did kilian make on 3/5/2023?", - "intent_template": "How many commits did {{user}} make on {{date}}?", - "instantiation_dict": {"user": "kilian", "date": "3/5/2023"}, + "intent": "How many commits did kilian make on March 5, 2023 for the current project?", + "intent_template": "How many commits did {{user}} make on {{date}} for the current project?", + "instantiation_dict": {"user": "kilian", "date": "March 5, 2023"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -5635,15 +5531,14 @@ "task_id": 206, "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Eric make on 3/2?", - "intent_template": "How many commits did {{user}} make on {{date}}?", - "instantiation_dict": {"user": "Eric", "date": "3/2"}, + "intent": "How many commits did Eric make on March 2, 2023 for the current project?", + "intent_template": "How many commits did {{user}} make on {{date}} for the current project?", + "instantiation_dict": {"user": "Eric", "date": "March 2, 2023"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -5659,15 +5554,14 @@ "task_id": 207, "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many 
commits did Eric and Kilian make on 1/3/2023 in total?", - "intent_template": "How many commits did {{user}} make on {{date}} in total?", - "instantiation_dict": {"user": "Eric and Kilian", "date": "1/3/2023"}, + "intent": "How many commits did Eric and Kilian make on January 3, 2023 in total for the current project?", + "intent_template": "How many commits did {{user}} make on {{date}} in total for the current project?", + "instantiation_dict": {"user": "Eric and Kilian", "date": "January 3, 2023"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -5683,11 +5577,10 @@ "task_id": 208, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Find the customer name and email with phone number +1 2058812302", - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "intent": "Get the customer name and email with phone number +1 2058812302", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", "instantiation_dict": {"PhoneNum": "+1 2058812302"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5714,11 +5607,10 @@ "task_id": 209, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Find the customer name and email with phone number 2137418080", - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "intent": "Get the customer name and email with phone number 2137418080", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", "instantiation_dict": {"PhoneNum": "2137418080"}, "format_specification": 
"Use \"name\" for the customer name and \"email\" for the email.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5745,11 +5637,10 @@ "task_id": 210, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Find the customer name and email with phone number 2065555555", - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "intent": "Get the customer name and email with phone number 2065555555", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", "instantiation_dict": {"PhoneNum": "2065555555"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5776,11 +5667,10 @@ "task_id": 211, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Find the customer name and email with phone number 8015551212", - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "intent": "Get the customer name and email with phone number 8015551212", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", "instantiation_dict": {"PhoneNum": "8015551212"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5807,11 +5697,10 @@ "task_id": 212, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Find the customer name and email with phone number 555-229-3326", - "intent_template": "Find the customer name and email with phone number {{PhoneNum}}", + "intent": "Get the customer name and email with phone number 555-229-3326", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", "instantiation_dict": {"PhoneNum": "555-229-3326"}, "format_specification": "Use \"name\" for the customer 
name and \"email\" for the email.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5838,11 +5727,10 @@ "task_id": 213, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 3 stars or below for Antonia Racer Tank.", - "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "intent": "Get the title and rating for all reviews with 3 stars or below for Antonia Racer Tank.", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", "instantiation_dict": {"product": "Antonia Racer Tank"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5872,11 +5760,10 @@ "task_id": 214, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 3 stars or below for Zing Jump Rope.", - "intent_template": "List all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Zing Jump Rope"}, + "intent": "Get the title and rating for all reviews with 3 stars or below for Erica Sports Bra.", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", + "instantiation_dict": {"product": "Erica Sports Bra"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5889,7 +5776,14 @@ "required": ["rating", "title"] } }, - "expected": {"performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": []} + "expected": { + "performed_operation": "retrieve", + "status": "SUCCESS", + "retrieved_data": [ + { "title": "Doesn't fit me. Luma fail.", "rating": "2" }, + { "title": "does not fit. worthless." 
, "rating": "1" } + ] + } } ], "revision": 2 @@ -5899,11 +5793,10 @@ "task_id": 215, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 3 stars or below for Circe ice fleece.", - "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "intent": "Get the title and rating for all reviews with 3 stars or below for Circe ice fleece.", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", "instantiation_dict": {"product": "Circe ice fleece"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5930,11 +5823,10 @@ "task_id": 216, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 3 stars or below for Electra Bra Top.", - "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "intent": "Get the title and rating for all reviews with 3 stars or below for Electra Bra Top.", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", "instantiation_dict": {"product": "Electra Bra Top"}, "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5961,11 +5853,10 @@ "task_id": 217, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "List all reviews with 3 stars or below for Pursuit Tone Band.", - "intent_template": "List all reviews with 3 stars or below for {{product}}.", + "intent": "Get the title and rating for all reviews with 3 stars or below for Pursuit Tone Band.", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", "instantiation_dict": {"product": "Pursuit Tone Band"}, 
"format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5999,7 +5890,6 @@ "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", "instantiation_dict": {"location": "CMU, Pittsburgh", "n": "5"}, "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6022,7 +5912,6 @@ "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", "instantiation_dict": {"location": "Pittsburgh airport", "n": "3"}, "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6045,7 +5934,6 @@ "intent_template": "Tell me the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", "instantiation_dict": {"location": "Gardner Steel Conference Center,", "n": 5}, "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6082,7 +5970,6 @@ "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", "instantiation_dict": {"location": "USPS postal office"}, "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6120,12 +6007,11 @@ "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", "instantiation_dict": {"location": "cold 
stone ice cream"}, "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -6140,16 +6026,15 @@ "task_id": 223, "intent_template_id": 35, "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to the nearest Mcdonald's with different transportation methods?", + "intent": "I am at CMU Pittsburgh, how long does it take to the nearest McDonald's with different transportation methods?", "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", "instantiation_dict": {"location": "Mcdonald's"}, "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -6168,12 +6053,11 @@ "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", "instantiation_dict": {"location": "wendys"}, "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": 
{"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -6192,7 +6076,6 @@ "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", "instantiation_dict": {"product_type": "brush", "manufature": "sephora"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6215,7 +6098,6 @@ "intent_template": "What is the price range for products from {{brand}}?", "instantiation_dict": {"brand": "Amazon basic"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6224,7 +6106,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -6246,7 +6131,6 @@ "intent_template": "What is the price range for products from {{brand}}?", "instantiation_dict": {"brand": "EYZUTAK"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6255,7 +6139,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -6277,7 +6164,6 @@ "intent_template": "What is the price range for products from {{brand}}?", "instantiation_dict": {"brand": "sephora"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6286,7 +6172,10 @@ 
"type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -6308,7 +6197,6 @@ "intent_template": "What is the price range for products from {{brand}}?", "instantiation_dict": {"brand": "ugreen"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6317,7 +6205,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -6339,7 +6230,6 @@ "intent_template": "What is the price range for products from {{brand}}?", "instantiation_dict": {"brand": "Perricone MD"}, "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6348,7 +6238,10 @@ "type": "array", "items": { "type": "object", - "properties": { "min": {"type": "number"}, "max": {"type": "number"} }, + "properties": { + "min": { "type": "number", "format": "currency" }, + "max": { "type": "number", "format": "currency" } + }, "required": ["max", "min"] } }, @@ -6366,11 +6259,10 @@ "task_id": 231, "intent_template_id": 213, "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent cancelled order ", - "intent_template": "Get the order number of my most recent {{status}} order ", + "intent": "Get the order number of my most recent cancelled order", + "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "cancelled"}, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6390,11 +6282,10 @@ "task_id": 232, "intent_template_id": 213, "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent pending order ", - "intent_template": "Get the order number of my most recent {{status}} order ", + "intent": "Get the order number of my most recent pending order", + "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "pending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6414,11 +6305,10 @@ "task_id": 233, "intent_template_id": 213, "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent complete order ", - "intent_template": "Get the order number of my most recent {{status}} order ", + "intent": "Get the order number of my most recent complete order", + "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "complete"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6438,11 +6328,10 @@ "task_id": 234, "intent_template_id": 213, "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent on hold order ", - "intent_template": "Get the order number of my most recent {{status}} order ", + "intent": "Get the order number of my most recent on hold order", + "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "on hold"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6461,11 +6350,10 @@ "task_id": 235, "intent_template_id": 213, "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent under delivery order ", - "intent_template": "Get the order number of my most recent {{status}} order ", + 
"intent": "Get the order number of my most recent under delivery order", + "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "under delivery"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6492,7 +6380,6 @@ "condition": "I can walk within 20mins" }, "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6540,7 +6427,6 @@ "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}", "instantiation_dict": {"location": "gas station", "location2": "CMU", "condition": ""}, "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6588,7 +6474,6 @@ "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", "instantiation_dict": {"product_category": "PS4 accessories"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6598,12 +6483,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -6618,7 +6501,6 @@ "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", "instantiation_dict": {"product_category": "nutrition bars and drinks"}, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6628,12 +6510,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -6644,11 +6524,10 @@ "task_id": 240, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from competitive swimwear category", + "intent": "I am doing a market survey for one stop market, show me the most expensive product from men's Uniforms, Work & Safety category", "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": {"product_category": "competitive swimwear"}, + "instantiation_dict": {"product_category": "men's Uniforms, Work & Safety"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6658,12 +6537,9 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html", - "response_status": 200, - "event_type": "navigation" + "url": "__SHOPPING__/arc-teryx-beta-sv-bib-men-s-versatile-durable-gore-tex-pro-bib-for-severe-conditions.html" } } ], @@ -6674,11 +6550,10 @@ "task_id": 241, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care tool category", + "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care 
category", "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", - "instantiation_dict": {"product_category": "skin care tool"}, + "instantiation_dict": {"product_category": "skin care"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6688,12 +6563,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -6708,7 +6581,6 @@ "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", "instantiation_dict": {"product_category": "Household Supplies"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6718,12 +6590,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -6738,7 +6608,6 @@ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", "product": "Circe fleece"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6762,7 +6631,6 @@ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", 
"product": "Olivia zip jacket"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6786,7 +6654,6 @@ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "name", "product": "Antonia racer tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6810,7 +6677,6 @@ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "name", "product": "Chloe tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6834,7 +6700,6 @@ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", "product": "the style of Zoe products"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6857,7 +6722,6 @@ "intent_template": "Tell me the coordinates of {{location}} in DD format", "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9"}, "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6881,7 +6745,6 @@ "intent_template": "Tell me the coordinates of {{location}} in DD format", "instantiation_dict": {"location": "Western Pennsylvania Hospital Heliport"}, "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6905,7 +6768,6 @@ "intent_template": "Tell me the coordinates of {{location}} in DD format", "instantiation_dict": {"location": "Apple Store near 
Pitt"}, "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6929,7 +6791,6 @@ "intent_template": "Tell me the coordinates of {{location}} in DD format", "instantiation_dict": {"location": "bus stop on the Carnegie art museum side of the street near CMU"}, "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6953,7 +6814,6 @@ "intent_template": "Tell me the coordinates of {{location}} in DD format", "instantiation_dict": {"location": "Tokyo Japanese Food Store in Pittsburgh"}, "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -6977,7 +6837,6 @@ "intent_template": "What is the {{information}} of {{location}}", "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9", "information": "phone number"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7000,7 +6859,6 @@ "intent_template": "What is the {{information}} of {{location}}", "instantiation_dict": {"location": "Western Pennsylvania Hospital", "information": "phone number"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7024,7 +6882,6 @@ "intent_template": "Who is the {{information}} of {{location}}", "instantiation_dict": {"location": "PIT airport", "information": "operator"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7048,7 +6905,6 @@ "intent_template": "What is the {{information}} of {{location}}", "instantiation_dict": {"location": "Carnegie art museum in 
pittsburgh", "information": "website"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7075,7 +6931,6 @@ "information": "hours of operation" }, "format_specification": "Provide the information for each day of the week when the location is open. Use \"day\" for the day of the week, \"open_time\" for the opening time, and \"close_time\" for the closing time.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7112,11 +6967,10 @@ "task_id": 258, "intent_template_id": 325, "start_urls": ["__GITLAB__"], - "intent": "Navigate to all public projects", - "intent_template": "Navigate to all public projects", + "intent": "Show me all public projects", + "intent_template": "Show me all public projects", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7126,12 +6980,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__GITLAB__/explore?visibility_level=20", + "url": "__GITLAB__/explore", "response_status": 200, - "event_type": "navigation" + "query_params": { "visibility_level": ["20"] } } } ], @@ -7146,11 +6999,9 @@ "intent_template": "Get me my RSS feed token", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", @@ -7170,7 +7021,6 @@ "intent_template": "I want to browse the products in the {{category}} category", "instantiation_dict": {"category": "Video Game"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7180,13 +7030,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", 
"last_event_only": true, - "expected": { - "url": "__SHOPPING__/video-games.html", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/video-games.html", "response_status": 200} } ], "revision": 2 @@ -7200,7 +7045,6 @@ "intent_template": "I want to browse the products in the {{category}} category", "instantiation_dict": {"category": "Headphones"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7210,13 +7054,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/electronics/headphones.html", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/electronics/headphones.html", "response_status": 200} } ], "revision": 2 @@ -7230,7 +7069,6 @@ "intent_template": "I want to browse the products in the {{category}} category", "instantiation_dict": {"category": "Men shoes"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7240,12 +7078,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -7260,7 +7096,6 @@ "intent_template": "I want to browse the products in the {{category}} category", "instantiation_dict": {"category": "Woman clothing"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7270,12 +7105,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 
200 } } ], @@ -7290,7 +7123,6 @@ "intent_template": "I want to browse the products in the {{category}} category", "instantiation_dict": {"category": "Cabinets, Racks & Shelves"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7300,12 +7132,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -7316,11 +7146,10 @@ "task_id": 265, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to Boston? How far is it to drive there?", - "intent_template": "What's the closest national park to {{city}}? How far is it to drive there?", + "intent": "What's the closest national park to Boston (use the provided wiki site to look up any needed information)? How far is it to drive there?", + "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? 
How far is it to drive there?", "instantiation_dict": {"city": "Boston"}, "format_specification": "Provide the name of the park using \"park_name\" and its distance using \"distance\".", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7350,11 +7179,10 @@ "task_id": 266, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to the largest city in Maine?", - "intent_template": "What's the closest national park to {{city}}?", + "intent": "What's the closest national park to the largest city in Maine (use the provided wiki site to look up any needed information)?", + "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)?", "instantiation_dict": {"city": "the largest city in Maine"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7374,11 +7202,10 @@ "task_id": 267, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?", - "intent_template": "What's the closest national park to {{city}}? How long it takes to drive there?", + "intent": "What's the closest national park to the hometown of Stephen King (use the provided wiki site to look up any needed information)? How long it takes to drive there?", + "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? 
How long it takes to drive there?", "instantiation_dict": {"city": "the hometown of Stephen King"}, "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7408,11 +7235,10 @@ "task_id": 268, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?", - "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?", + "intent": "What's the closest national park to Vinalhaven, ME (use the provided wiki site to look up any needed information)? How long does it take to bike there?", + "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? How long does it take to bike there?", "instantiation_dict": {"city": "Vinalhaven, ME"}, "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7442,11 +7268,10 @@ "task_id": 269, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me products under $25 in \"women shoes\" category", - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": {"price": "25", "product_category": "women shoes"}, + "intent": "Show me all products in \"women shoes\" category filtered to under $25", + "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "instantiation_dict": {"price_range": "under $25", "product_category": "women shoes"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7456,12 +7281,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - 
"url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["^(?!price$).+$"], "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25", - "response_status": 200, - "event_type": "navigation" + "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html", + "query_params": { "price": ["0-25"] } } } ], @@ -7472,11 +7296,10 @@ "task_id": 270, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me products under $30 in \"men shoes\" category", - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": {"price": "30", "product_category": "men shoes"}, + "intent": "Show me all products in \"men shoes\" category filtered to under $30", + "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "instantiation_dict": {"price_range": "under $30", "product_category": "men shoes"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7486,12 +7309,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30", + "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "price": ["0-30"] } } } ], @@ -7502,11 +7324,10 @@ "task_id": 271, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me products under $46.99 in \"makeup remover\" category", - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": {"price": "46.99", "product_category": "makeup remover"}, + "intent": "Show me all products in \"makeup remover\" category filtered to under $46.99", + "intent_template": "Show me all products in \"{{product_category}}\" 
category filtered to {{price_range}}", + "instantiation_dict": {"price_range": "under $46.99", "product_category": "makeup remover"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7516,12 +7337,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99", + "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "price": ["0-46.99"] } } } ], @@ -7532,11 +7352,10 @@ "task_id": 272, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me products under $78 in \"children dental care\" category", - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": {"price": "78", "product_category": "children dental care"}, + "intent": "Show me all products in \"children dental care\" category filtered to under $78", + "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "instantiation_dict": {"price_range": "under $78", "product_category": "children dental care"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7546,12 +7365,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78", + "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "price": ["0-78"] } } } ], @@ -7562,11 +7380,10 @@ "task_id": 273, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me products under 
$199 in \"furtiture with accent\" category", - "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category", - "instantiation_dict": {"price": "199", "product_category": "furtiture with accent"}, + "intent": "Show me all products in \"furniture with accent\" category filtered to under $199", + "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "instantiation_dict": {"price_range": "under $199", "product_category": "furniture with accent"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7576,12 +7393,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199", + "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "price": ["0-199"] } } } ], @@ -7596,7 +7412,6 @@ "intent_template": "Search for \"{{keyword}}\"", "instantiation_dict": {"keyword": "usb wifi"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7606,12 +7421,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi", + "url": "__SHOPPING__/catalogsearch/result/", "response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["usb wifi"] } } } ], @@ -7626,7 +7440,6 @@ "intent_template": "Search for \"{{keyword}}\"", "instantiation_dict": {"keyword": "xbox"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7636,12 +7449,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, 
"expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=xbox", + "url": "__SHOPPING__/catalogsearch/result/", "response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["xbox"] } } } ], @@ -7656,7 +7468,6 @@ "intent_template": "Search for \"{{keyword}}\"", "instantiation_dict": {"keyword": "switch accessories"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7666,12 +7477,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories", + "url": "__SHOPPING__/catalogsearch/result/", "response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["switch accessories"] } } } ], @@ -7686,7 +7496,6 @@ "intent_template": "Search for \"{{keyword}}\"", "instantiation_dict": {"keyword": "batteries for iphone 13"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7696,12 +7505,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=batteries+for+iphone+13", + "url": "__SHOPPING__/catalogsearch/result/", "response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["batteries for iphone 13"] } } } ], @@ -7716,7 +7524,6 @@ "intent_template": "Search for \"{{keyword}}\"", "instantiation_dict": {"keyword": "green tea bag for weight loss"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7726,12 +7533,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss", + "url": "__SHOPPING__/catalogsearch/result/", 
"response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["green tea bag for weight loss"] } } } ], @@ -7746,7 +7552,6 @@ "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", "instantiation_dict": {"product": "Bluetooth headphones from Sony"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7757,8 +7562,8 @@ "type": "object", "properties": { "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "string"}, - "max": {"type": "string"} + "min": {"type": "number", "format": "currency"}, + "max": {"type": "number", "format": "currency"} }, "required": ["max", "min", "names"] } @@ -7782,8 +7587,8 @@ "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed" ], - "min": "18.99", - "max": "406" + "min": 18.99, + "max": 406 } ] } @@ -7800,7 +7605,6 @@ "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", "instantiation_dict": {"product": "chargers from Anker"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7854,7 +7658,6 @@ "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", "instantiation_dict": {"product": "Oral B brush heads designed for children"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7865,8 
+7668,8 @@ "type": "object", "properties": { "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "string"}, - "max": {"type": "string"} + "min": {"type": "number", "format": "currency"}, + "max": {"type": "number", "format": "currency"} }, "required": ["max", "min", "names"] } @@ -7880,8 +7683,8 @@ "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack" ], - "min": "12.99", - "max": "14.98" + "min": 12.99, + "max": 14.98 } ] } @@ -7898,7 +7701,6 @@ "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", "instantiation_dict": {"product": "slide slippers from Nike"}, "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7909,8 +7711,8 @@ "type": "object", "properties": { "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "string"}, - "max": {"type": "string"} + "min": {"type": "number", "format": "currency"}, + "max": {"type": "number", "format": "currency"} }, "required": ["max", "min", "names"] } @@ -7931,8 +7733,8 @@ "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", "Nike womens Benassi Just Do It" ], - "min": "27.6", - "max": "90.65" + "min": 27.6, + "max": 90.65 } ] } @@ -7945,11 +7747,10 @@ "task_id": 283, "intent_template_id": 210, "start_urls": ["__SHOPPING__"], - "intent": "Navigate to the most recent models of XBox controllers released between 2020-2021.", - "intent_template": "Navigate to the most recent models of XBox controllers released between 2020-2021.", + "intent": "Show me the most recent models of XBox controllers released between 2020-2021.", + "intent_template": "Show me the most recent models of XBox controllers released between 
2020-2021.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7959,12 +7760,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -7979,7 +7778,6 @@ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "shoe storage", "min_storage": "12 pairs"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -7989,12 +7787,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -8009,7 +7805,6 @@ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "switch card holder", "min_storage": "15 cards"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8019,12 +7814,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200, - "event_type": "navigation" + 
"response_status": 200 } } ], @@ -8039,7 +7832,6 @@ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "ssd hard drive", "min_storage": "1TB"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8049,12 +7841,10 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -8069,12 +7859,11 @@ "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -8093,7 +7882,6 @@ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", "instantiation_dict": {"attribute": "name"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8117,7 +7905,6 @@ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", "instantiation_dict": {"attribute": "email address, name, phone number"}, "format_specification": "Use \"name\" for the customer name and \"email\" for the email and \"phone_number\" for the phone number.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8158,7 +7945,6 @@ 
"intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", "instantiation_dict": {"attribute": "product SKUs in the most recent cancelled orders"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8180,18 +7966,19 @@ "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": {"attribute": "total spend on products in the most recent cancelled orders"}, + "instantiation_dict": { + "attribute": "total spend on products in the most recent cancelled orders excluding shipping and handling" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [148] + "retrieved_data": [148.8] } } ], @@ -8206,7 +7993,6 @@ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", "instantiation_dict": {"attribute": "total number of cancellations"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8226,21 +8012,18 @@ "task_id": 293, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Show me the command to clone Super_Awesome_Robot with SSH.", - "intent_template": "Show me the command to clone {{repo}} with SSH.", + "intent": "Get the URL to clone Super_Awesome_Robot with SSH", + "intent_template": "Get the URL to clone {{repo}} with SSH", "instantiation_dict": 
{"repo": "Super_Awesome_Robot"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/convexegg/super_awesome_robot.git" - ] + "retrieved_data": ["ssh://git@__GITLAB__:2222/convexegg/super_awesome_robot.git"] } } ], @@ -8251,19 +8034,18 @@ "task_id": 294, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Show me the command to clone ChatGPT with SSH.", - "intent_template": "Show me the command to clone {{repo}} with SSH.", + "intent": "Get the URL to clone ChatGPT with SSH", + "intent_template": "Get the URL to clone {{repo}} with SSH", "instantiation_dict": {"repo": "ChatGPT"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/convexegg/chatgpt.git"] + "retrieved_data": ["ssh://git@__GITLAB__:2222/convexegg/chatgpt.git"] } } ], @@ -8274,19 +8056,18 @@ "task_id": 295, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Show me the command to clone metaseq with SSH.", - "intent_template": "Show me the command to clone {{repo}} with SSH.", + "intent": "Get the URL to clone metaseq with SSH", + "intent_template": "Get the URL to clone {{repo}} with SSH", "instantiation_dict": {"repo": "metaseq"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": { 
"type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/root/metaseq.git"] + "retrieved_data": ["ssh://git@__GITLAB__:2222/root/metaseq.git"] } } ], @@ -8297,21 +8078,18 @@ "task_id": 296, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Show me the command to clone the best GAN python implementation with SSH.", - "intent_template": "Show me the command to clone {{repo}} with SSH.", + "intent": "Get the URL to clone the best GAN python implementation with SSH", + "intent_template": "Get the URL to clone {{repo}} with SSH", "instantiation_dict": {"repo": "the best GAN python implementation"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [ - "git clone ssh://git@{{ resource.base_host }}:2222/eriklindernoren/PyTorch-GAN.git" - ] + "retrieved_data": ["ssh://git@__GITLAB__:2222/eriklindernoren/PyTorch-GAN.git"] } } ], @@ -8322,19 +8100,18 @@ "task_id": 297, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Show me the command to clone the most stared Covid location tracker with SSH.", - "intent_template": "Show me the command to clone {{repo}} with SSH.", - "instantiation_dict": {"repo": "the most stared Covid location tracker"}, + "intent": "Get the URL to clone the most stared Covid related project with SSH", + "intent_template": "Get the URL to clone {{repo}} with SSH", + "instantiation_dict": {"repo": "the most stared Covid related project"}, "format_specification": null, - "start_url_context": null, 
"eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["git clone ssh://git@{{ resource.base_host }}:2222/yjlou/2019-nCov.git"] + "retrieved_data": ["ssh://git@__GITLAB__:2222/covid19india/covid19india-react.git"] } } ], @@ -8349,7 +8126,6 @@ "intent_template": "Show the most recent {{status}} order", "instantiation_dict": {"status": "completed"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8359,13 +8135,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/sales/order/view/order_id/180/", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/sales/order/view/order_id/180/", "response_status": 200} } ], "revision": 2 @@ -8379,7 +8150,6 @@ "intent_template": "Show the most recent {{status}} order", "instantiation_dict": {"status": "cancelled"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8389,13 +8159,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/sales/order/view/order_id/170/", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/sales/order/view/order_id/170/", "response_status": 200} } ], "revision": 2 @@ -8409,7 +8174,6 @@ "intent_template": "Show the most recent {{status}} order", "instantiation_dict": {"status": "pending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8419,12 +8183,10 @@ { "evaluator": 
"NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": "__SHOPPING__/sales/order/view/order_id/189/", - "response_status": 200, - "event_type": "navigation" + "headers": {"referer": "__SHOPPING__/sales/order/history/"} } } ], @@ -8439,16 +8201,20 @@ "intent_template": "Show the most recent {{status}} order", "instantiation_dict": {"status": "processing"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -8462,16 +8228,20 @@ "intent_template": "Show the most recent {{status}} order", "instantiation_dict": {"status": "out of delivery"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -8481,15 +8251,14 @@ "task_id": 303, "intent_template_id": 321, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Kilian make during 2023?", - "intent_template": "How many commits did {{user}} make {{period}}?", + "intent": "How many commits did Kilian make during 2023 in the current repository?", + "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", "instantiation_dict": {"user": "Kilian", "period": "during 2023"}, "format_specification": null, - "start_url_context": 
"Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -8505,15 +8274,17 @@ "task_id": 304, "intent_template_id": 321, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Eric make between Feb 2023 and May 2023?", - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": {"user": "Eric", "period": "between Feb 2023 and May 2023"}, + "intent": "How many commits did Eric Bailey make between start of Feb 2023 and end of May 2023 in the current repository?", + "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", + "instantiation_dict": { + "user": "Eric Bailey", + "period": "between start of Feb 2023 and end of May 2023" + }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -8529,15 +8300,14 @@ "task_id": 305, "intent_template_id": 321, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Philip make in 2023/1?", - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": {"user": "Philip", "period": "in 2023/1"}, + "intent": "How many commits did Philip make in Jan 2023 in the current repository?", + "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", + "instantiation_dict": {"user": "Philip", "period": "in Jan 2023"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { 
"evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -8553,15 +8323,14 @@ "task_id": 306, "intent_template_id": 321, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Anthony make between 08/2022-09/2022?", - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": {"user": "Anthony", "period": "between 08/2022-09/2022"}, + "intent": "How many commits did Anthony make between August 2022 through September 2022 in the current repository?", + "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", + "instantiation_dict": {"user": "Anthony", "period": "between August 2022 through September 2022"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -8577,20 +8346,19 @@ "task_id": 307, "intent_template_id": 321, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Nic make in April 2021?", - "intent_template": "How many commits did {{user}} make {{period}}?", - "instantiation_dict": {"user": "Nic", "period": "in April 2021"}, + "intent": "How many commits did Nic Chan make on April 7th 2022 in the current repository?", + "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", + "instantiation_dict": {"user": "Nic Chan", "period": "on April 7th 2022"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, 
"results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [16] + "retrieved_data": [5] } } ], @@ -8605,7 +8373,6 @@ "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "primer/design"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8628,7 +8395,6 @@ "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "thoughtbot/administrate"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8651,7 +8417,6 @@ "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "AndroidSlidingUpPanel"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8674,7 +8439,6 @@ "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "Pytorch GAN"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8697,7 +8461,6 @@ "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "csvkit"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8720,7 +8483,6 @@ "intent_template": "Get the customer service phone number", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8739,11 +8501,10 @@ "task_id": 314, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "List the full names of the top 3 contributors to 
primer/design repo, ranked by the number of commits?", - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "intent": "Get the full names of the top 3 contributors (by commit count) to primer/design repo", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", "instantiation_dict": {"repo": "primer/design", "attribute": "full names"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8762,11 +8523,10 @@ "task_id": 315, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "List the email addresses of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?", - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "intent": "Get the email addresses of the top 3 contributors (by commit count) to Pytorch GAN repo", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", "instantiation_dict": {"repo": "Pytorch GAN", "attribute": "email addresses"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -8785,20 +8545,23 @@ "task_id": 316, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "List the names of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?", - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", - "instantiation_dict": {"repo": "facebook's guide on building react apps", "attribute": "names"}, + "intent": "Get the email addresses of the top 3 contributors (by commit count) to facebook\"s guide on building react apps repo", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to 
{{repo}} repo", + "instantiation_dict": { + "repo": "facebook's guide on building react apps", + "attribute": "email addresses" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, + "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "status": "SUCCESS", + "retrieved_data": ["dan.abramov@gmail.com", "timer150@gmail.com", "ian@iansutherland.ca"] + }, + "ordered": false } ], "revision": 2 @@ -8808,15 +8571,14 @@ "task_id": 317, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "List the names and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?", - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "intent": "Get the names and number of commits of the top 3 contributors (by commit count) to metaseq repo", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", "instantiation_dict": {"repo": "metaseq", "attribute": "names and number of commits"}, "format_specification": "Use objects with keys: \"first_name\", \"last_name\" and \"number_of_commits\".", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": { @@ -8847,15 +8609,14 @@ "task_id": 318, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?", - "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo, ranked by the number of commits?", + "intent": "Get the last names of the top 3 contributors (by commit count) to 2019-nCov repo", + "intent_template": 
"Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", "instantiation_dict": {"repo": "2019-nCov", "attribute": "last names"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", @@ -8871,21 +8632,16 @@ "task_id": 319, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund I should expect from my order canceled in April 2022, including shipping fee", - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "instantiation_dict": {"time": "April 2022"}, + "intent": "How much refund should I expect from my orders canceled, if any, in April 2022, including shipping fee", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "instantiation_dict": {"time": "April 2022", "conditions": ", including shipping fee"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, + "expected": {"performed_operation": "retrieve", "status": "NOT_FOUND_ERROR"} } ], "revision": 2 @@ -8895,20 +8651,19 @@ "task_id": 320, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund I should expect from my order canceled in Feb 2023, including shipping fee", - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "instantiation_dict": {"time": "Feb 2023"}, + "intent": "How much 
refund should I expect from my orders canceled, if any, in Feb 2023, including shipping fee", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "instantiation_dict": {"time": "Feb 2023", "conditions": ", including shipping fee"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["406.53"] + "retrieved_data": [406.53] } } ], @@ -8919,20 +8674,19 @@ "task_id": 321, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund I should expect from my order canceled in 2022, including shipping fee", - "intent_template": "How much refund I should expect from my order canceled in {{time}}, including shipping fee", - "instantiation_dict": {"time": "2022"}, + "intent": "How much refund should I expect from my orders canceled, if any, in 2022, including shipping fee", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "instantiation_dict": {"time": "2022", "conditions": ", including shipping fee"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["3053.97"] + "retrieved_data": [3053.97] } } ], @@ -8943,20 +8697,22 @@ "task_id": 322, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund I should expect from my order canceled in May 
2023 if I cannot get the shipping fee refunded?", - "intent_template": "How much refund I should expect from my order canceled in {{time}} if I cannot get the shipping fee refunded?", - "instantiation_dict": {"time": "May 2023"}, + "intent": "How much refund should I expect from my orders canceled, if any, in May 2023 if I cannot get the shipping fee refunded?", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "instantiation_dict": { + "time": "May 2023", + "conditions": " if I cannot get the shipping fee refunded?" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["350.42"] + "retrieved_data": [350.42] } } ], @@ -8967,20 +8723,22 @@ "task_id": 323, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund I should expect from my order canceled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "intent_template": "How much refund I should expect from my order canceled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "instantiation_dict": {"time": "2022/03"}, + "intent": "How much refund should I expect from my orders canceled, if any, in March 2022? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "instantiation_dict": { + "time": "March 2022", + "conditions": "? 
I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["264.49"] + "retrieved_data": [264.49] } } ], @@ -8995,7 +8753,6 @@ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", "instantiation_dict": {"product": "chairs", "sorting_order": "ascending price"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9005,12 +8762,15 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc", + "url": "__SHOPPING__/catalogsearch/result/index/", "response_status": 200, - "event_type": "navigation" + "query_params": { + "product_list_order": [ "price" ], + "q" : [ "chairs" ], + "product_list_dir" : [ "asc" ] + } } } ], @@ -9025,7 +8785,6 @@ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", "instantiation_dict": {"product": "mouth night guard", "sorting_order": "descending price"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9035,12 +8794,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order).+$"], "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price", - "response_status": 200, - "event_type": 
"navigation" + "url": "__SHOPPING__/catalogsearch/result/index/", + "query_params": { "q": ["mouth night guard "], "product_list_order": ["price"] } } } ], @@ -9058,7 +8816,6 @@ "sorting_order": "search relevance, from most to least" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9068,12 +8825,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer", + "url": "__SHOPPING__/catalogsearch/result/", "response_status": 200, - "event_type": "navigation" + "query_params": { "q": ["Canon photo printer"] } } } ], @@ -9088,7 +8844,6 @@ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "name alphabetically"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9098,12 +8853,15 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order$).+$"], "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name", - "response_status": 200, - "event_type": "navigation" + "url": "__SHOPPING__/catalogsearch/result/index/", + "query_params": { + "q" : [ " iphone 12 phone case" ], + "product_list_order": [ "name" ], + "product_list_dir" : [ "asc" ] + } } } ], @@ -9118,7 +8876,6 @@ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "price"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9128,12 +8885,11 @@ { "evaluator": "NetworkEventEvaluator", 
"site": "shopping", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["^(?!q$|product_list_order$).+$"], "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case", - "response_status": 200, - "event_type": "navigation" + "url": "__SHOPPING__/catalogsearch/result/index/", + "query_params": { "product_list_order": ["price"], "q": [" iphone 12 phone case"] } } } ], @@ -9144,16 +8900,15 @@ "task_id": 329, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on 4/19/2023 on shopping at One Stop Market?", - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "instantiation_dict": {"time": "on 4/19/2023"}, + "intent": "Return the total amount I spent on shopping at One Stop Market on April 19, 2023, excluding shipping.", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", + "instantiation_dict": {"time": "on April 19, 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", @@ -9168,20 +8923,19 @@ "task_id": 330, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent in March 2023 on shopping at One Stop Market?", - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "intent": "Return the total amount I spent on shopping at One Stop Market in March 2023, excluding shipping.", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", "instantiation_dict": {"time": "in March 2023"}, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["83.31"] + "retrieved_data": [53.31] } } ], @@ -9192,20 +8946,19 @@ "task_id": 331, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent in July 2022 on shopping at One Stop Market?", - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", + "intent": "Return the total amount I spent on shopping at One Stop Market in July 2022, excluding shipping.", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", "instantiation_dict": {"time": "in July 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["40.16"] + "retrieved_data": [25.16] } } ], @@ -9216,11 +8969,10 @@ "task_id": 332, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent each month from Jan to the end of March 2023 on shopping at One Stop Market?", - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "instantiation_dict": {"time": "each month from Jan to the end of March 2023"}, + "intent": "Return the total amount I spent on shopping at One Stop Market each months from Jan to the March 31, 2023, excluding shipping.", + "intent_template": "Return the total amount I spent on shopping at One 
Stop Market {{time}}, excluding shipping.", + "instantiation_dict": {"time": "each months from Jan to the March 31, 2023"}, "format_specification": "Use \"month\" for month and \"total\" for spent amount.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9240,9 +8992,9 @@ "performed_operation": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "month": "Jan", "total": 572.88 }, - { "month": "Feb", "total": 947.50 }, - { "month": "Mar", "total": 83.31 } + { "month": "Jan", "total": 542.88 }, + { "month": "Feb", "total": 912.50 }, + { "month": "Mar", "total": 53.31 } ] } } @@ -9254,20 +9006,19 @@ "task_id": 333, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on November 2022 on shopping at One Stop Market?", - "intent_template": "Return how much I spent {{time}} on shopping at One Stop Market?", - "instantiation_dict": {"time": "on November 2022"}, + "intent": "Return the total amount I spent on shopping at One Stop Market in November 2022, excluding shipping.", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", + "instantiation_dict": {"time": "in November 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["403.18"] + "retrieved_data": [358.18] } } ], @@ -9278,20 +9029,19 @@ "task_id": 334, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Tell me when I last ordered my muffin cornbread mix?", - "intent_template": "Tell me when I last ordered my {{description}}?", + "intent": "Return the date I last ordered my muffin cornbread mix", + "intent_template": 
"Return the date I last ordered my {{description}}", "instantiation_dict": {"description": "muffin cornbread mix"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["March 11th 2023"] + "retrieved_data": ["03/11/2023"] } } ], @@ -9302,20 +9052,19 @@ "task_id": 335, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Tell me when I last ordered my body butter?", - "intent_template": "Tell me when I last ordered my {{description}}?", + "intent": "Return the date I last ordered my body butter", + "intent_template": "Return the date I last ordered my {{description}}", "instantiation_dict": {"description": "body butter"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["January 16th 2023"] + "retrieved_data": ["01/16/2023"] } } ], @@ -9326,20 +9075,19 @@ "task_id": 336, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Tell me when I last ordered my conditioner?", - "intent_template": "Tell me when I last ordered my {{description}}?", + "intent": "Return the date I last ordered my conditioner", + "intent_template": "Return the date I last ordered my {{description}}", "instantiation_dict": {"description": "conditioner"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", 
"items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["January 16th 2023"] + "retrieved_data": ["01/16/2023"] } } ], @@ -9350,20 +9098,19 @@ "task_id": 337, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Tell me when I last ordered my bread olive?", - "intent_template": "Tell me when I last ordered my {{description}}?", - "instantiation_dict": {"description": "bread olive"}, + "intent": "Return the date I last ordered my olive bread", + "intent_template": "Return the date I last ordered my {{description}}", + "instantiation_dict": {"description": "olive bread"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["December 12th 2022"] + "retrieved_data": ["12/12/2022"] } } ], @@ -9374,20 +9121,19 @@ "task_id": 338, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Tell me when I last ordered my toothpaste?", - "intent_template": "Tell me when I last ordered my {{description}}?", + "intent": "Return the date I last ordered my toothpaste", + "intent_template": "Return the date I last ordered my {{description}}", "instantiation_dict": {"description": "toothpaste"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["December 4th 
2022"] + "retrieved_data": ["12/04/2022"] } } ], @@ -9398,11 +9144,10 @@ "task_id": 339, "intent_template_id": 299, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Navigate to the list of all opened issues that report bugs", - "intent_template": "Navigate to the list of all opened issues {{description}}", + "intent": "Show me the list of all opened issues that report bugs for the current project", + "intent_template": "Show me the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that report bugs"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9412,12 +9157,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug&state=opened", + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/", "response_status": 200, - "event_type": "navigation" + "query_params": { "label_name[]": ["bug"], "state": ["opened"] } } } ], @@ -9428,11 +9172,10 @@ "task_id": 340, "intent_template_id": 299, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Navigate to the list of all opened issues that report bugs", - "intent_template": "Navigate to the list of all opened issues {{description}}", + "intent": "Show me the list of all opened issues that report bugs for the current project", + "intent_template": "Show me the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that report bugs"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9442,12 +9185,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": 
true, "expected": { - "url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E&state=opened", + "url": "__GITLAB__/primer/design/-/issues/", "response_status": 200, - "event_type": "navigation" + "query_params": { "label_name[]": ["type: bug \ud83d\udc1e"], "state": ["opened"] } } } ], @@ -9458,11 +9200,10 @@ "task_id": 341, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Navigate to the list of all opened issues requesting new features", - "intent_template": "Navigate to the list of all opened issues {{description}}", + "intent": "Show me the list of all opened issues requesting new features for the current project", + "intent_template": "Show me the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "requesting new features"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9472,12 +9213,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement&state=opened", + "url": "__GITLAB__/root/metaseq/-/issues/", "response_status": 200, - "event_type": "navigation" + "query_params": { "label_name[]": ["enhancement"], "state": ["opened"] } } } ], @@ -9488,11 +9228,10 @@ "task_id": 342, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Navigate to the list of all opened issues that ask about OPT model related questions", - "intent_template": "Navigate to the list of all opened issues {{description}}", + "intent": "Show me the list of all opened issues that ask about OPT model related questions for the current project", + "intent_template": "Show me the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": 
"that ask about OPT model related questions"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9502,12 +9241,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question&state=opened", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=question&search=OPT%20Model" + } } } ], @@ -9518,11 +9259,10 @@ "task_id": 343, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Navigate to the list of all opened issues that don't have any labels", - "intent_template": "Navigate to the list of all opened issues {{description}}", + "intent": "Show me the list of all opened issues that don't have any labels for the current project", + "intent_template": "Show me the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that don't have any labels"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9532,12 +9272,14 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, + "ignored_query_params_patterns": ["page", "sort"], "expected": { - "url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None&state=opened", - "response_status": 200, - "event_type": "navigation" + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=None" + } } } ], @@ -9552,7 
+9294,6 @@ "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "so far"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9576,7 +9317,6 @@ "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "in Apr 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9600,7 +9340,6 @@ "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "during 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9624,7 +9363,6 @@ "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "from the beginning of the shop"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9648,7 +9386,6 @@ "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "in May 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9672,16 +9409,15 @@ "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", "instantiation_dict": {"repo": "gimmiethat.space"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["yjlou"] + "retrieved_data": [ ["yjlou", "@yjlou"] ] } } ], @@ -9696,16 +9432,15 @@ "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", "instantiation_dict": {"repo": "prism-theme"}, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["abisubramanya27"] + "retrieved_data": [ ["abisubramanya27", "@abisubramanya27"] ] } } ], @@ -9720,7 +9455,6 @@ "intent_template": "List products from {{product_category}} category by {{order}} price", "instantiation_dict": {"product_category": "PS4 accessories", "order": "ascending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9730,12 +9464,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price", + "url": "__SHOPPING__/video-games/playstation-4/accessories.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "product_list_order": ["price"] } } } ], @@ -9750,7 +9483,6 @@ "intent_template": "List products from {{product_category}} category by {{order}} price", "instantiation_dict": {"product_category": "nutrition bars and drinks", "order": "ascending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9760,12 +9492,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price", + "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "product_list_order": ["price"] } } } ], @@ -9780,7 +9511,6 @@ "intent_template": "List products from {{product_category}} category by {{order}} 
price", "instantiation_dict": {"product_category": "competitive swimwear", "order": "ascending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9790,12 +9520,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price", + "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "product_list_order": ["price"] } } } ], @@ -9810,7 +9539,6 @@ "intent_template": "List products from {{product_category}} category by {{order}} price", "instantiation_dict": {"product_category": "living room furtniture", "order": "descending"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9820,12 +9548,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc", + "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "product_list_order": ["price"], "product_list_dir": ["desc"] } } } ], @@ -9836,11 +9563,10 @@ "task_id": 355, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from kids' bedding category by descending price", + "intent": "List products from kids\" bedding category by descending price", "intent_template": "List products from {{product_category}} category by {{order}} price", "instantiation_dict": {"product_category": "kids' bedding", "order": "descending"}, "format_specification": null, - "start_url_context": null, "eval": [ 
{ "evaluator": "AgentResponseEvaluator", @@ -9850,12 +9576,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc", + "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html", "response_status": 200, - "event_type": "navigation" + "query_params": { "product_list_dir": ["desc"] } } } ], @@ -9870,7 +9595,6 @@ "intent_template": "Show the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9880,9 +9604,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -9892,11 +9615,10 @@ "task_id": 357, "intent_template_id": 291, "start_urls": ["__GITLAB__"], - "intent": "Navigate to the merge requests requiring my review", - "intent_template": "Navigate to the merge requests requiring my review", + "intent": "Show me the merge requests requiring my review", + "intent_template": "Show me the merge requests requiring my review", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9906,15 +9628,13 @@ { "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "url_match_mode": "exact", "last_event_only": true, "expected": { "url": [ "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze&scope=all&state=opened" ], - "response_status": 200, - "event_type": "navigation" + "response_status": 200 } } ], @@ -9929,7 
+9649,6 @@ "intent_template": "Get the {{info}} for order number {{order_number}}.", "instantiation_dict": {"info": "shipping method", "order_number": 187}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -9953,16 +9672,15 @@ "intent_template": "Get the {{info}} for order number {{order_number}}.", "instantiation_dict": {"info": "order date", "order_number": "148"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": ["1/29/2023"] + "retrieved_data": ["January 29, 2023"] } } ], @@ -9977,7 +9695,6 @@ "intent_template": "Get the {{info}} for order number {{order_number}}.", "instantiation_dict": {"info": "product names", "order_number": "148"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10004,7 +9721,6 @@ "intent_template": "Get the {{info}} for order number {{order_number}}.", "instantiation_dict": {"info": "order statuses", "order_number": "170 and 189"}, "format_specification": "Use \"order_number\" for order number and \"status\" for status.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10038,7 +9754,6 @@ "intent_template": "Get the {{info}} for order number {{order_number}}.", "instantiation_dict": {"info": "billing address", "order_number": "00178"}, "format_specification": "Use keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\". 
Set any key to null if not available.", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10089,7 +9804,6 @@ "location/address_2": "Carnegie Music Hall" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10116,7 +9830,6 @@ "location/address_2": "UPMC Shadyside" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10143,7 +9856,6 @@ "location/address_2": "UPMC Shadyside" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10170,7 +9882,6 @@ "location/address_2": "UPMC Shadyside" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10197,7 +9908,6 @@ "location/address_2": "CVS (closet one)" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10221,7 +9931,6 @@ "intent_template": "find discounted items.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10244,7 +9953,6 @@ "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Carnegie Music Hall"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10254,13 +9962,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__MAP__/way/154257484/", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__MAP__/way/154257484/", "response_status": 200} } ], "revision": 2 @@ -10274,7 +9977,6 @@ "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Carnegie Mellon University"}, "format_specification": null, 
- "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10284,13 +9986,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__MAP__/relation/2279034/", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__MAP__/relation/2279034/", "response_status": 200} } ], "revision": 2 @@ -10304,7 +10001,6 @@ "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Piada restaurant near Pitt"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10314,13 +10010,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__MAP__/node/2710170970", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__MAP__/node/2710170970", "response_status": 200} } ], "revision": 2 @@ -10334,7 +10025,6 @@ "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "the Costco in Pittsburgh near a river"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10344,13 +10034,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__MAP__/way/168456128", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__MAP__/way/168456128", "response_status": 200} } ], "revision": 2 @@ -10364,7 +10049,6 @@ "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Whole Foods near Carnegie Mellon"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10374,13 +10058,8 @@ { "evaluator": "NetworkEventEvaluator", 
"site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__MAP__/node/10114377662", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__MAP__/node/10114377662", "response_status": 200} } ], "revision": 2 @@ -10390,11 +10069,10 @@ "task_id": 374, "intent_template_id": 266, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the Magento Blank theme settings", - "intent_template": "Show the {{name}} theme settings", + "intent": "Show me the Magento Blank theme settings page", + "intent_template": "Show me the {{name}} theme settings page", "instantiation_dict": {"name": "Magento Blank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10404,12 +10082,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1", - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1"} } ], "revision": 2 @@ -10419,11 +10093,10 @@ "task_id": 375, "intent_template_id": 266, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the Magento Luma theme settings", - "intent_template": "Show the {{name}} theme settings", + "intent": "Show me the Magento Luma theme settings page", + "intent_template": "Show me the {{name}} theme settings page", "instantiation_dict": {"name": "Magento Luma"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10433,12 +10106,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/", - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/"} } ], 
"revision": 2 @@ -10452,7 +10121,6 @@ "intent_template": "Summarize customer reviews for {{product}}.", "instantiation_dict": {"product": "Amazon Echo Dot 3rd generation"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10475,7 +10143,6 @@ "intent_template": "Search for \"{{space}} near {{location}}\"", "instantiation_dict": {"location": "CMU ArtPark Lab", "space": "resturants"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10485,12 +10152,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab", + "url": "__MAP__/search", "response_status": 200, - "event_type": "navigation" + "query_params": { "query": ["restaurants near CMU ArtPark Lab"] } } } ], @@ -10505,7 +10171,6 @@ "intent_template": "Search for \"{{space}} near {{location}}\"", "instantiation_dict": {"location": "Carnegie Mellon University", "space": "parking"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10515,12 +10180,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__MAP__/search?query=parking%20near%20Carnegie%20Mellon%20University", + "url": "__MAP__/search", "response_status": 200, - "event_type": "navigation" + "query_params": { "query": ["parking near Carnegie Mellon University"] } } } ], @@ -10535,7 +10199,6 @@ "intent_template": "Search for \"{{space}} near {{location}}\"", "instantiation_dict": {"location": "Carnegie Mellon University", "space": "hotels"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10545,12 +10208,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": 
"exact", "last_event_only": true, "expected": { - "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Mellon%20University", + "url": "__MAP__/search", "response_status": 200, - "event_type": "navigation" + "query_params": { "query": ["hotels near Carnegie Mellon University"] } } } ], @@ -10565,7 +10227,6 @@ "intent_template": "Search for \"{{space}} near {{location}}\"", "instantiation_dict": {"location": "Carnegie Music Hall", "space": "bars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10575,12 +10236,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall", + "url": "__MAP__/search", "response_status": 200, - "event_type": "navigation" + "query_params": { "query": ["bars near Carnegie Music Hall"] } } } ], @@ -10595,7 +10255,6 @@ "intent_template": "Search for \"{{space}} near {{location}}\"", "instantiation_dict": {"location": "Carnegie Music Hall", "space": "hotels"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10605,12 +10264,11 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, "expected": { - "url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall", + "url": "__MAP__/search", "response_status": 200, - "event_type": "navigation" + "query_params": { "query": ["hotels near Carnegie Music Hall"] } } } ], @@ -10625,7 +10283,6 @@ "intent_template": "I am arriving at Carnegie Mellon University. 
Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10648,7 +10305,6 @@ "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", "instantiation_dict": {}, "format_specification": "Use \"hotel\" for the hotel name and \"supermarkets\" for the list of supermarket names", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10687,7 +10343,6 @@ "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10711,7 +10366,6 @@ "intent_template": "List the customer names who thinks EYZUTAK phone cases are of good looking", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10735,7 +10389,6 @@ "intent_template": "What is the rating of {{product}}", "instantiation_dict": {"product": "Ugreen lightning to 3.5mm cable"}, "format_specification": "Return a value between 0 and 100", - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10759,7 +10412,6 @@ "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", "instantiation_dict": {"stars": "4 or 5 stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10768,10 +10420,7 @@ "expected": { "performed_operation": "retrieve", "status": "SUCCESS", - "retrieved_data": [ - "MH", "Misba009", "Amanda", "N Randall", "Amazon Customer", "Cally", 
- "Bethany Robertson" - ] + "retrieved_data": ["MH", "Misba009", "Amanda", "Amazon Customer", "Cally", "Bethany Robertson"] } } ], @@ -10786,7 +10435,6 @@ "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", "instantiation_dict": {"stars": "1 or 2 stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10814,7 +10462,6 @@ "content": "Thanks, working on reviews" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10822,13 +10469,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "merge_request_iid": 450, - "comment": "Thanks, working on reviews", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/primer/design/notes", + "http_method": "POST", + "query_string": {"target_id": "139245", "target_type": "merge_request"}, + "post_data": { + "$.note.noteable_type": "MergeRequest", + "$.note.note": "Thanks, working on reviews" + }, + "response_status": 200 + } } ], "revision": 2 @@ -10846,7 +10499,6 @@ "content": "lgtm" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10854,13 +10506,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1531, - "comment": "lgtm", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/notes", + "http_method": "POST", + "query_string": {"target_id": "72594", "target_type": "merge_request"}, + "post_data": {"$.note.noteable_type": 
"MergeRequest", "$.note.note": "lgtm"}, + "response_status": 200 + } } ], "revision": 2 @@ -10878,7 +10533,6 @@ "content": "close because non reproducible" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10886,13 +10540,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1265, - "comment": "close because non reproducible", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/notes", + "http_method": "POST", + "query_string": {"target_id": "72404", "target_type": "merge_request"}, + "post_data": { + "$.note.noteable_type": "MergeRequest", + "$.note.note": "close because non reproducible" + }, + "response_status": 200 + } } ], "revision": 2 @@ -10906,7 +10566,6 @@ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": {"mr": "color ulitity", "repo": "a11yproject.com", "content": "Good idea"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10914,13 +10573,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1071, - "comment": "Good idea", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/notes", + "http_method": "POST", + "query_string": {"target_id": "72135", "target_type": "merge_request"}, + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Good idea"}, + "response_status": 200 
+ } } ], "revision": 2 @@ -10938,7 +10600,6 @@ "content": "lgtm" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10946,13 +10607,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "empathy-prompts", - "merge_request_iid": 19, - "comment": "lgtm", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/byteblaze/empathy-prompts/notes", + "http_method": "POST", + "query_string": {"target_id": "138843", "target_type": "merge_request"}, + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "lgtm"}, + "response_status": 200 + } } ], "revision": 2 @@ -10966,7 +10630,6 @@ "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "2019-nCov"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10974,10 +10637,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "2019-nCov", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/79/fork", + "http_method": "POST", + "post_data": {"id": "79", "name": "2019-nCov", "namespace_id": 2505, "path": "2019-nCov"}, + "response_status": 201 + } } ], "revision": 2 @@ -10991,7 +10657,6 @@ "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "the Pytorch GAN repo with most stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -10999,10 +10664,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "Pytorch-GAN", - 
"site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/86/fork", + "http_method": "POST", + "post_data": {"id": "86", "name": "PyTorch-GAN", "namespace_id": 2505, "path": "PyTorch-GAN"}, + "response_status": 201 + } } ], "revision": 2 @@ -11016,7 +10684,6 @@ "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "ChatGPT"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11024,10 +10691,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "ChatGPT", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/175/fork", + "http_method": "POST", + "post_data": {"id": "175", "name": "Chatgpt", "namespace_id": 2505, "path": "chatgpt"}, + "response_status": 201 + } } ], "revision": 2 @@ -11041,7 +10711,6 @@ "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "MetaSeq"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11049,10 +10718,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "metaseq", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/33/fork", + "http_method": "POST", + "post_data": {"id": "33", "name": "metaseq", "namespace_id": 2505, "path": "metaseq"}, + "response_status": 201 + } } ], "revision": 2 @@ -11062,11 +10735,10 @@ "task_id": 398, "intent_template_id": 352, "start_urls": ["__GITLAB__"], - "intent": "Fork all source repos from Akilesh Kannan.", + "intent": "Fork all repos from Akilesh Kannan.", "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": 
"all source repos from Akilesh Kannan"}, + "instantiation_dict": {"repo": "all repos from Akilesh Kannan"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11074,10 +10746,59 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": ["SimCache", "dots", "CacheEval", "nvidia-patch", "viewgrades-scraper"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/170/fork", + "http_method": "POST", + "post_data": {"id": "170", "name": "CacheEval", "namespace_id": 2505, "path": "CacheEval"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/165/fork", + "http_method": "POST", + "post_data": { + "id": "165", + "name": "nvidia-patch", + "namespace_id": 2505, + "path": "nvidia-patch" + }, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/163/fork", + "http_method": "POST", + "post_data": {"id": "163", "name": "SimCache", "namespace_id": 2505, "path": "SimCache"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/164/fork", + "http_method": "POST", + "post_data": { + "id": "164", + "name": "viewgrades-scraper", + "namespace_id": 2505, + "path": "viewgrades-scraper" + }, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/159/fork", + "http_method": "POST", + "post_data": {"id": "159", "name": "dots", "namespace_id": 2505, "path": "dots"}, + "response_status": 201 + } } ], "revision": 2 @@ -11087,11 +10808,10 @@ "task_id": 399, "intent_template_id": 6, "start_urls": ["__REDDIT__"], - "intent": "Change my reddit bio to \"I am a 
robot\"", - "intent_template": "Change my reddit bio to \"{{content}}\"", + "intent": "Change my bio to \"I am a robot\" in the discussion forum", + "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "I am a robot"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11099,9 +10819,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"content": "I am a robot"} + "evaluator": "NetworkEventEvaluator", + "last_event_only": false, + "expected": { + "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, + "post_data": {"user_biography[biography]": "I am a robot"}, + "response_status": 302, + "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + } } ], "revision": 2 @@ -11111,11 +10838,10 @@ "task_id": 400, "intent_template_id": 6, "start_urls": ["__REDDIT__"], - "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"", - "intent_template": "Change my reddit bio to \"{{content}}\"", + "intent": "Change my bio to \"Pro Python Developer with 20 years of Experience\" in the discussion forum", + "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Pro Python Developer with 20 years of Experience"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11123,9 +10849,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"content": "Pro Python Developer with 20 years of Experience"} + "evaluator": "NetworkEventEvaluator", + 
"last_event_only": false, + "expected": { + "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, + "post_data": { + "user_biography[biography]": "Pro Python Developer with 20 years of Experience" + }, + "response_status": 302, + "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + } } ], "revision": 2 @@ -11135,11 +10870,10 @@ "task_id": 401, "intent_template_id": 6, "start_urls": ["__REDDIT__"], - "intent": "Change my reddit bio to \"Seeking SDE positions\"", - "intent_template": "Change my reddit bio to \"{{content}}\"", + "intent": "Change my bio to \"Seeking SDE positions\" in the discussion forum", + "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Seeking SDE positions"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11147,9 +10881,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"content": "Seeking SDE positions"} + "evaluator": "NetworkEventEvaluator", + "last_event_only": false, + "expected": { + "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, + "post_data": {"user_biography[biography]": "Seeking SDE positions"}, + "response_status": 302, + "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + } } ], "revision": 2 @@ -11159,11 +10900,10 @@ "task_id": 402, "intent_template_id": 6, "start_urls": ["__REDDIT__"], - "intent": "Change my reddit bio to \"Freelance Web Developer\"", - "intent_template": "Change my reddit bio to \"{{content}}\"", + "intent": "Change my bio to \"Freelance Web Developer\" in the discussion forum", + "intent_template": 
"Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Freelance Web Developer"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11171,9 +10911,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"content": "Freelance Web Developer"} + "evaluator": "NetworkEventEvaluator", + "last_event_only": false, + "expected": { + "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, + "post_data": {"user_biography[biography]": "Freelance Web Developer"}, + "response_status": 302, + "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + } } ], "revision": 2 @@ -11183,11 +10930,10 @@ "task_id": 403, "intent_template_id": 6, "start_urls": ["__REDDIT__"], - "intent": "Change my reddit bio to \"Awesome Prompt Artist\"", - "intent_template": "Change my reddit bio to \"{{content}}\"", + "intent": "Change my bio to \"Awesome Prompt Artist\" in the discussion forum", + "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Awesome Prompt Artist"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11195,9 +10941,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"content": "Awesome Prompt Artist"} + "evaluator": "NetworkEventEvaluator", + "last_event_only": false, + "expected": { + "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, + "post_data": {"user_biography[biography]": 
"Awesome Prompt Artist"}, + "response_status": 302, + "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + } } ], "revision": 2 @@ -11207,11 +10960,10 @@ "task_id": 404, "intent_template_id": 22, "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in books subreddit", - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": {"subreddit": "books"}, + "intent": "Upvote the newest post in books forum", + "intent_template": "Upvote the newest post in {{forum}} forum", + "instantiation_dict": {"forum": "books"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11219,9 +10971,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [124260] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/124260.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -11231,11 +10986,10 @@ "task_id": 405, "intent_template_id": 22, "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in DIY subreddit", - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": {"subreddit": "DIY"}, + "intent": "Upvote the newest post in DIY forum", + "intent_template": "Upvote the newest post in {{forum}} forum", + "instantiation_dict": {"forum": "DIY"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11243,9 +10997,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [119019] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/119019.json", + "http_method": "POST", + "post_data": {"choice": "1"} 
+ } } ], "revision": 2 @@ -11255,11 +11012,10 @@ "task_id": 406, "intent_template_id": 22, "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in future technology subreddit", - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": {"subreddit": "future technology"}, + "intent": "Upvote the newest post in future technology forum", + "intent_template": "Upvote the newest post in {{forum}} forum", + "instantiation_dict": {"forum": "future technology"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11267,9 +11023,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [119517] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/119517.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -11279,11 +11038,10 @@ "task_id": 407, "intent_template_id": 22, "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in deeplearning subreddit", - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": {"subreddit": "deeplearning"}, + "intent": "Upvote the newest post in the deep learning forum", + "intent_template": "Upvote the newest post in {{forum}} forum", + "instantiation_dict": {"forum": "the deep learning"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11291,9 +11049,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [125036] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/125036.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ 
-11303,11 +11064,10 @@ "task_id": 408, "intent_template_id": 22, "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in explain like im 5 subreddit", - "intent_template": "Upvote the newest post in {{subreddit}} subreddit", - "instantiation_dict": {"subreddit": "explain like im 5"}, + "intent": "Upvote the newest post in explain like im 5 forum", + "intent_template": "Upvote the newest post in {{forum}} forum", + "instantiation_dict": {"forum": "explain like im 5"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11315,9 +11075,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [125342] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/125342.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -11334,7 +11097,6 @@ "content_description": "thanks! I am a big fan of your website." }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11342,11 +11104,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "comment_id": "1235250", - "reply_content": "thanks! I am a big fan of your website." + "url": "__REDDIT__/f/books/59421/-/comment/1235250", + "http_method": "POST", + "post_data": {"reply_to_comment_1235250[comment]": "thanks! 
I am a big fan of your website."}, + "response_status": 302 } } ], @@ -11364,7 +11127,6 @@ "content_description": "don't panic" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11372,9 +11134,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"comment_id": "1042264", "reply_content": "don't panic"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/singularity/69404/-/comment/1042264", + "http_method": "POST", + "post_data": {"reply_to_comment_1042264[comment]": "don't panic"}, + "response_status": 302 + } } ], "revision": 2 @@ -11388,7 +11154,6 @@ "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "license": "an MIT license"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11396,14 +11161,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "cloud-to-butt", - "file_path": "LICENSE.txt", - "substring": "MIT License", - "branch": "master", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/byteblaze/cloud-to-butt/-/update/master/LICENSE.txt", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "LICENSE", + "commit_message": "Update LICENSE.txt", + "branch_name": "master" + }, + "response_status": 302 + } } ], "revision": 2 @@ -11420,7 +11191,6 @@ "license": "an Apache License" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11428,14 +11198,20 @@ 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "accessible-html-content-patterns", - "file_path": "LICENSE", - "substring": "Apache License", - "branch": "main", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/update/main/LICENSE", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "LICENSE", + "commit_message": "Update LICENSE", + "branch_name": "main" + }, + "response_status": 302 + } } ], "revision": 2 @@ -11452,7 +11228,6 @@ "license": "one that mandates all copies and derivative works to be under the same license" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11460,15 +11235,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "a11y-syntax-highlighting", - "file_path": "LICENSE", - "substring": "GNU GENERAL PUBLIC LICENSE", - "not_substring": "MIT License", - "branch": "main", + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "last_event_only": true, + "expected": { + "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/update/main/LICENSE", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "LICENSE", + "commit_message": "Update LICENSE", + "branch_name": "main" + }, + "response_status": 302 + } } ], "revision": 2 @@ -11482,7 +11262,6 @@ "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": {"repo": "byteblaze/dotfiles", "license": "an MIT license"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11507,11 +11286,10 @@ 
"task_id": 415, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "intent": "Check the code review assigned to me for verification functions. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", + "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "verification functions"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11519,13 +11297,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "a11y-webring.club", - "merge_request_iid": 40, - "comment": "@davepgreene", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/a11y-webring.club/notes?target_id=138783&target_type=merge_request", + "http_method": "POST", + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "@davepgreene "} + } } ], "revision": 2 @@ -11535,11 +11312,10 @@ "task_id": 416, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", + "intent": "Check the code review assigned to me for wcag. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", + "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "wcag"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11547,13 +11323,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1270, - "comment": "Thank you", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/notes?target_id=72409&target_type=merge_request", + "http_method": "POST", + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Thank you"} + } } ], "revision": 2 @@ -11563,11 +11338,10 @@ "task_id": 417, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.", - "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. 
Otherwise remind him with a simple @.", + "intent": "Check the code review assigned to me for 404 link. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", + "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "404 link"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11575,13 +11349,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "merge_request_iid": 1485, - "comment": "@Roshanjossey", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/notes?target_id=72560&target_type=merge_request", + "http_method": "POST", + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Thank you"} + } } ], "revision": 2 @@ -11595,7 +11368,6 @@ "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Busy"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11603,11 +11375,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "compare_str": "Busy", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/user/status", + "http_method": "PUT", + "post_data": {"availability": "busy", "clear_status_after": null} + } } ], "revision": 2 @@ -11621,7 +11394,6 @@ "intent_template": "Set my gitlab status 
as {{status}}.", "instantiation_dict": {"status": "Enjoying life"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11629,11 +11401,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "compare_str": "Enjoying life", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/user/status", + "http_method": "PUT", + "post_data": { + "message": "Enjoying life.", + "availability": "busy", + "clear_status_after": null + }, + "response_status": 200 + } } ], "revision": 2 @@ -11647,7 +11425,6 @@ "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Playing Badminton"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11655,11 +11432,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "compare_str": "Playing Badminton", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/user/status", + "http_method": "PUT", + "post_data": { + "message": "Playing Badminton.", + "availability": "busy", + "clear_status_after": null + }, + "response_status": 200 + } } ], "revision": 2 @@ -11673,7 +11456,6 @@ "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Resting due to leg injury"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11681,11 +11463,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "compare_str": "Resting due to leg injury", 
- "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/user/status", + "http_method": "PUT", + "post_data": { + "message": "Resting due to leg injury.", + "availability": "busy", + "clear_status_after": null + }, + "response_status": 200 + } } ], "revision": 2 @@ -11699,7 +11487,6 @@ "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Out of Office"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11707,11 +11494,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "compare_str": "Out of Office", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/user/status", + "http_method": "PUT", + "post_data": { + "message": "Out of Office.", + "availability": "busy", + "clear_status_after": null + }, + "response_status": 200 + } } ], "revision": 2 @@ -11725,7 +11518,6 @@ "intent_template": "Mark all {{brand}} shirts on sale", "instantiation_dict": {"brand": "Hollister"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11735,16 +11527,15 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": false, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/126/type/configurable/store/0/set/9/?isAjax=true", - "event_type": "modification", + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/126/type/configurable/store/0/set/9/", "headers": { "referer": "__SHOPPING_ADMIN__/admin/catalog/product/edit/id/126/", "X-Requested-With": "XMLHttpRequest" }, - "post_data": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"} + "post_data": {"report_type": 
"created_at_order", "from": "02/1/2023", "to": "02/28/2023"}, + "query_params": { "isAjax": ["true"] } } } ], @@ -11755,11 +11546,10 @@ "task_id": 424, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the place where Mr. Rogers was filmed on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the place where Mr. Rogers was filmed on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": {"description": "the place where Mr. Rogers was filmed"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11774,11 +11564,10 @@ "task_id": 425, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the longest bridge in the Western hemisphere on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the longest bridge in the Western hemisphere on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": {"description": "the longest bridge in the Western hemisphere"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11793,13 +11582,12 @@ "task_id": 426, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 
11th attacks on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": { "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11814,11 +11602,10 @@ "task_id": 427, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the university that has most Turning Award winners on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the university that has most Turning Award winners on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": {"description": "the university that has most Turning Award winners"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11833,13 +11620,12 @@ "task_id": 428, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": { "description": "the undergrad college of the person who developed the Nash equilibrium" }, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11854,11 +11640,10 @@ "task_id": 429, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": {"description": "the colleges where The Chair was filmed in Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11873,13 +11658,12 @@ "task_id": 430, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.", - "intent_template": "Find the page of {{description}} on the map.", + "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map (use the provided wiki site to look up any needed information).", + "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", "instantiation_dict": { "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11902,7 +11686,6 @@ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context 
of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11910,9 +11693,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B0933NCMSC"} + "last_event_only": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/32202$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "32202"}, + "response_status": 200 + } } ], "revision": 2 @@ -11929,7 +11718,6 @@ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11937,9 +11725,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B00CPTR7WS"} + "last_event_only": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/104498$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "104498"}, + "response_status": 200 + } } ], "revision": 2 @@ -11956,7 +11750,6 @@ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -11983,7 +11776,6 @@ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": 
"AgentResponseEvaluator", @@ -12010,7 +11802,6 @@ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12018,9 +11809,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B01JMA0YX6"} + "last_event_only": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/77300$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "77300"}, + "response_status": 200 + } } ], "revision": 2 @@ -12034,7 +11831,6 @@ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a mattress foundation", "time": "around Feb or March 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12058,7 +11854,6 @@ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a table lamp", "time": "in May 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12082,7 +11877,6 @@ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a TV stand", "time": "sometime around sep 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12106,7 +11900,6 @@ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", "instantiation_dict": {"product": "a cat t-shirt", "time": "during 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12130,7 +11923,6 @@ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a make up removal kit", "time": "during summer 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12150,11 +11942,10 @@ "task_id": 441, "intent_template_id": 308, "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update the project site's title to \"GIVE ME SPACE\"", - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": {"title": "GIVE ME SPACE"}, + "intent": "Update and commit (to the default branch) the website code for the current project using the simple online file editor to change the browser tab title to \"GIVE ME SPACE\"", + "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", + "instantiation_dict": {"title": "GIVE ME SPACE", "target_branch": "the default branch"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12162,14 +11953,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "GIVE ME SPACE", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "expected": { + "url": 
"__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "index.html", + "branch_name": "main", + "original_branch": "main", + "content": "^<\\!doctype html>.*.*GIVE ME SPACE.*.*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12179,11 +11976,10 @@ "task_id": 442, "intent_template_id": 308, "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update the project site's title to \"Welcome to my site\"", - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": {"title": "Welcome to my site"}, + "intent": "Update and commit (to main) the website code for the current project using the simple online file editor to change the browser tab title to \"Welcome to my site\"", + "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", + "instantiation_dict": {"title": "Welcome to my site", "target_branch": "main"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12191,14 +11987,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Welcome to my site", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "expected": { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "index.html", + "branch_name": "main", + "original_branch": "main", + "content": "^<\\!doctype html>.*.*Welcome to my site.*.*$" + }, + 
"response_status": 302 + } } ], "revision": 2 @@ -12208,11 +12010,13 @@ "task_id": 443, "intent_template_id": 308, "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update the project site's title to \"Not an interesting site\"", - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": {"title": "Not an interesting site"}, + "intent": "Update and commit (to a new branch with no merged request) the website code for the current project using the simple online file editor to change the browser tab title to \"Not an interesting site\"", + "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", + "instantiation_dict": { + "title": "Not an interesting site", + "target_branch": "a new branch with no merged request" + }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12220,14 +12024,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Not an interesting site", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "expected": { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "index.html", + "branch_name": "^(?!main$).+$", + "original_branch": "main", + "content": "^<\\!doctype html>.*.*Not an interesting site.*.*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12237,11 +12047,13 @@ "task_id": 444, "intent_template_id": 308, "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], 
- "intent": "Update the project site's title to \"Title Wanted\"", - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": {"title": "Title Wanted"}, + "intent": "Update and commit (to a new branch called title-update with no merged request) the website code for the current project using the simple online file editor to change the browser tab title to \"Title Wanted\"", + "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", + "instantiation_dict": { + "title": "Title Wanted", + "target_branch": "a new branch called title-update with no merged request" + }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12249,14 +12061,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Title Wanted", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "expected": { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "index.html", + "branch_name": "title-update", + "original_branch": "main", + "content": "^<\\!doctype html>.*.*Title Wanted.*.*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12266,11 +12084,10 @@ "task_id": 445, "intent_template_id": 308, "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update the project site's title to \"Hello\"", - "intent_template": "Update the project site's title to \"{{title}}\"", - "instantiation_dict": {"title": "Hello"}, + "intent": "Update 
and commit (to main) the website code for the current project using the simple online file editor to change the browser tab title to \"Hello\"", + "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", + "instantiation_dict": {"title": "Hello", "target_branch": "main"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12278,14 +12095,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "index.html", - "substring": "Hello", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "expected": { + "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", + "http_method": "POST", + "post_data": { + "_method": "put", + "file_path": "index.html", + "branch_name": "main", + "original_branch": "main", + "content": "^<\\!doctype html>.*.*Hello.*.*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12299,7 +12122,6 @@ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "Roshanjossey"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12307,13 +12129,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "usernames": ["Roshanjossey"], - "group": "a11yproject", - "project": "a11yproject.com", - "issue_iid": 1478, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + 
"ignored_query_params": ["serializer"], + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1478.json", + "http_method": "PUT", + "post_data": { "$.issue.assignee_ids": [2264] } + } } ], "revision": 2 @@ -12331,7 +12153,6 @@ "account": "Rohan" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12339,13 +12160,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "usernames": ["Seirdy"], - "group": "byteblaze", - "project": "a11y-webring.club", - "issue_iid": 71, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "expected": { + "url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71.json", + "http_method": "PUT", + "post_data": { "$.issue.assignee_ids": [2366] } + } } ], "revision": 2 @@ -12359,7 +12180,6 @@ "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://egg.tart.com"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12367,11 +12187,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "website_url": "https://egg.tart.com", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/-/profile", + "http_method": "POST", + "post_data": {"user[website_url]": "https://egg.tart.com"}, + "response_status": 302 + } } ], "revision": 2 @@ -12381,11 +12203,10 @@ "task_id": 449, "intent_template_id": 331, "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz", + "intent": "set the homepage URL on my GitLab profile to helloworld.xyz", "intent_template": "set the homepage 
URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "https://helloworld.xyz"}, + "instantiation_dict": {"url": "helloworld.xyz"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12393,11 +12214,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "website_url": "https://helloworld.xyz", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { "type": "object", "properties": { "user[website_url]": {"type": "string"} } }, + "expected": { + "url": "__GITLAB__/-/profile", + "http_method": "POST", + "post_data": { "user[website_url]": ["https://helloworld.xyz", "http://helloworld.xyz"] }, + "response_status": 302 + } } ], "revision": 2 @@ -12411,7 +12235,6 @@ "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://a11yproject.contributor.me"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12419,11 +12242,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "website_url": "https://a11yproject.contributor.me", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/-/profile", + "http_method": "POST", + "post_data": {"user[website_url]": "https://a11yproject.contributor.me"}, + "response_status": 302 + } } ], "revision": 2 @@ -12433,11 +12258,10 @@ "task_id": 451, "intent_template_id": 331, "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to https://www.byteblaze.com", + "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", "intent_template": "set the homepage URL on my 
GitLab profile to {{url}}", - "instantiation_dict": {"url": "https://www.byteblaze.com"}, + "instantiation_dict": {"url": "www.byteblaze.com"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12445,11 +12269,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "website_url": "https://www.byteblaze.com", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { "type": "object", "properties": { "user[website_url]": {"type": "string"} } }, + "expected": { + "url": "__GITLAB__/-/profile", + "http_method": "POST", + "post_data": { + "user[website_url]": ["https://www.byteblaze.com", "http://www.byteblaze.com"] + }, + "response_status": 302 + } } ], "revision": 2 @@ -12463,7 +12292,6 @@ "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://byteblaze.github.io"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12471,11 +12299,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "website_url": "https://byteblaze.github.io", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/-/profile", + "http_method": "POST", + "post_data": {"user[website_url]": "https://byteblaze.github.io"}, + "response_status": 302 + } } ], "revision": 2 @@ -12489,7 +12319,6 @@ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Teton pullover hoodie"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12514,7 +12343,6 @@ 
"intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Ryker Tee Crew Neck"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12539,7 +12367,6 @@ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Helios Endurance Tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12564,7 +12391,6 @@ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Cora Pant"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12589,7 +12415,6 @@ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Karmen yoga pants"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12610,11 +12435,10 @@ "task_id": 458, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], - "intent": "Reduce the price of this product by $5", - "intent_template": "{{action}} the price of this product by {{amount}}", + "intent": "Reduce the price of the product on the current page by $5", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "$5", "performed_operation": "Reduce"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12635,11 +12459,10 @@ "task_id": 459, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/237/"], - "intent": "Reduce the price of this product by 10%", - "intent_template": 
"{{action}} the price of this product by {{amount}}", + "intent": "Reduce the price of the product on the current page by 10%", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "10%", "performed_operation": "Reduce"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12660,11 +12483,10 @@ "task_id": 460, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/418/"], - "intent": "Reduce the price of this product by 15%", - "intent_template": "{{action}} the price of this product by {{amount}}", + "intent": "Reduce the price of the product on the current page by 15%", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "15%", "performed_operation": "Reduce"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12685,11 +12507,10 @@ "task_id": 461, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/721/"], - "intent": "Increase the price of this product by $11.5", - "intent_template": "{{action}} the price of this product by {{amount}}", + "intent": "Increase the price of the product on the current page by $11.5", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "$11.5", "performed_operation": "Increase"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12710,11 +12531,10 @@ "task_id": 462, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/710/"], - "intent": "Increase the price of this product 
by 10%", - "intent_template": "{{action}} the price of this product by {{amount}}", + "intent": "Increase the price of the product on the current page by 10%", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "10%", "performed_operation": "Increase"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12735,11 +12555,10 @@ "task_id": 463, "intent_template_id": 247, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/996/"], - "intent": "Increase the price of this product by 15%", - "intent_template": "{{action}} the price of this product by {{amount}}", + "intent": "Increase the price of the product on the current page by 15%", + "intent_template": "{{action}} the price of the product on the current page by {{amount}}", "instantiation_dict": {"amount": "15%", "performed_operation": "Increase"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12760,11 +12579,10 @@ "task_id": 464, "intent_template_id": 251, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Antonia Racer Tank with the titles of all reviews with 4 stars or above.", - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "intent": "Update the product description of Antonia Racer Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", + "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such 
reviews.", "instantiation_dict": {"product": "Antonia Racer Tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12772,10 +12590,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1796", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { "description": ["A regular or me"] } + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1796/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": {"product[short_description]": "

1 customer(s) love it!

"} + } } ], "revision": 2 @@ -12791,7 +12614,6 @@ "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12815,7 +12637,6 @@ "intent_template": "Add {{product}} to my wish list", "instantiation_dict": {"product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12823,9 +12644,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B01M1RMOLX"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "22787", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -12841,7 +12668,6 @@ "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12867,7 +12693,6 @@ "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12893,7 +12718,6 @@ "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12917,7 +12741,6 @@ "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "302"}, "format_specification": null, 
- "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12942,7 +12765,6 @@ "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "307"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12967,7 +12789,6 @@ "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "299"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -12992,7 +12813,6 @@ "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "301"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13017,7 +12837,6 @@ "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "305"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13042,7 +12861,6 @@ "intent_template": "Set up a new, empty repository with the name {{project_name}}?", "instantiation_dict": {"project_name": "chatgpt_plugin"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13067,7 +12885,6 @@ "intent_template": "Set up a new, empty repository with the name {{project_name}}?", "instantiation_dict": {"project_name": "awesome_llm_reading"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13092,7 +12909,6 @@ "intent_template": "Set up a new, empty repository with the name {{project_name}}?", "instantiation_dict": {"project_name": "awesome_program_aided_reasoning"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13117,7 +12933,6 @@ "intent_template": "Set up a new, empty repository with the name {{project_name}}?", "instantiation_dict": {"project_name": "webagent"}, "format_specification": null, - "start_url_context": 
null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13142,7 +12957,6 @@ "intent_template": "Set up a new, empty repository with the name {{project_name}}?", "instantiation_dict": {"project_name": "awesome_webagent"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13167,7 +12981,6 @@ "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": {"collaborator_account_list": "yjlou", "repo": "solarized-prism-theme"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13175,12 +12988,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["yjlou"], - "group": "byteblaze", - "project": "solarized-prism-theme", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/188/members", + "http_method": "POST", + "post_data": {"user_id": 168, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -13194,7 +13008,6 @@ "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Abishek"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13202,13 +13015,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["abisubramanya27"], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/members", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -13222,7 +13035,6 @@ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "yjlou"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13230,13 +13042,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["yjlou"], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/members", + "http_method": "POST", + "post_data": {"user_id": 168, "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -13250,7 +13062,6 @@ "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Koushik"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13258,13 +13069,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["koush"], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/members", + "http_method": "POST", + "post_data": {"user_id": 1912, "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -13278,7 +13089,6 @@ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Jakub Klinkovsk\u00fd"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13286,13 +13096,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["lahwaacz"], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/members", + "http_method": "POST", + "post_data": {"user_id": 1842, "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -13306,7 +13116,6 @@ "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Vinta"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13314,13 +13123,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["vinta"], - "group": "byteblaze", - "project": "dotfiles", - "access_level": 10, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/members", + "http_method": "POST", + "post_data": {"user_id": 278, "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -13337,7 +13146,6 @@ "heading": "Bruh bro you clicked the wrong page" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13345,10 +13153,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "page_id": "1", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"title": "Bruh bro you clicked the wrong page"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", + "http_method": "POST", + "post_data": { + "title": "Bruh bro you clicked the wrong page", + "is_active": "1", + "store_id[0]": "0", + "page_id": "1" + }, + "response_status": 302 + } } ], "revision": 2 @@ -13365,7 +13183,6 @@ "heading": "Cookie monster coming to your place" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13373,10 +13190,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "page_id": "3", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"title": "Cookie monster 
coming to your place"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", + "http_method": "POST", + "post_data": { + "title": "Cookie monster coming to your place", + "is_active": "1", + "store_id[0]": "0", + "page_id": "3" + }, + "response_status": 302 + } } ], "revision": 2 @@ -13390,7 +13217,6 @@ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": {"old-heading": "Home Page", "heading": "This is the home page!! Leave here!!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13398,10 +13224,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "page_id": "2", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"title": "This is the home page!! Leave here!!"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", + "http_method": "POST", + "post_data": { + "title": "This is the home page!! 
Leave here!!", + "is_active": "1", + "store_id[0]": "0", + "page_id": "2" + }, + "response_status": 302 + } } ], "revision": 2 @@ -13418,7 +13254,6 @@ "heading": "No privacy policy is needed in this dystopian world" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13426,10 +13261,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "page_id": "4", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"title": "No privacy policy is needed in this dystopian world"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", + "http_method": "POST", + "post_data": { + "title": "No privacy policy is needed in this dystopian world", + "is_active": "1", + "store_id[0]": "0", + "page_id": "4" + }, + "response_status": 302 + } } ], "revision": 2 @@ -13443,7 +13288,6 @@ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": {"old-heading": "About us", "heading": "Secret"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13451,10 +13295,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "page_id": "5", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"title": "Secret"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", + "http_method": "POST", + "post_data": {"title": "Secret", "is_active": "1", "store_id[0]": "0", "page_id": "5"}, + "response_status": 302 + } } ], "revision": 2 @@ -13468,13 +13317,12 @@ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Sarah Miller", 
"message": "the order is ready to be shipped soon!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -13491,7 +13339,6 @@ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Jane Doe", "message": "sorry we are out of stock, please reorder"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13519,7 +13366,6 @@ "message": "sorry we are bankrupt, please contact our customer service for refund" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13546,7 +13392,6 @@ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Alex Thomas", "message": "Yo, your order will be shipped soon!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13571,7 +13416,6 @@ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Lily Potter", "message": "Thanks, your order is ready to be shipped!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13596,7 +13440,6 @@ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "8974568499", "order": "299", "service": "Federal Express"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13604,10 +13447,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "299", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"shipping_carrier": "Federal Express", "tracking_number": "8974568499"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/299/", + "http_method": "POST", + "post_data": {"tracking[1][carrier_code]": "fedex", "tracking[1][number]": "8974568499"}, + "response_status": 302 + } } ], "revision": 2 @@ -13621,7 +13469,6 @@ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "24353446464", "order": "307", "service": "DHL"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13629,10 +13476,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "307", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"shipping_carrier": "DHL", "tracking_number": "24353446464"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/307/", + "http_method": "POST", + "post_data": {"tracking[1][carrier_code]": "dhl", "tracking[1][number]": "24353446464"}, + "response_status": 302 + } } ], "revision": 2 @@ -13646,7 +13498,6 @@ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "55591023930", "order": "306", "service": "UPS"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13654,10 +13505,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "306", + "evaluator": "NetworkEventEvaluator", "site": 
"shopping_admin", - "expected": {"shipping_carrier": "UPS", "tracking_number": "55591023930"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/306/", + "http_method": "POST", + "post_data": {"tracking[1][carrier_code]": "ups", "tracking[1][number]": "55591023930"}, + "response_status": 302 + } } ], "revision": 2 @@ -13671,7 +13527,6 @@ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "13849373987", "order": "304", "service": "USPS"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13679,10 +13534,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "304", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"shipping_carrier": "USPS", "tracking_number": "13849373987"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/304/", + "http_method": "POST", + "post_data": {"tracking[1][carrier_code]": "usps", "tracking[1][number]": "13849373987"}, + "response_status": 302 + } } ], "revision": 2 @@ -13696,7 +13556,6 @@ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "239028439840", "order": "301", "service": "DHL"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13704,10 +13563,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "301", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"shipping_carrier": "DHL", "tracking_number": "239028439840"} + "last_event_only": true, + "expected": { + "url": 
"__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/301/", + "http_method": "POST", + "post_data": {"tracking[1][carrier_code]": "dhl", "tracking[1][number]": "239028439840"}, + "response_status": 302 + } } ], "revision": 2 @@ -13721,7 +13585,6 @@ "intent_template": "Make all {{product}} as out of stock", "instantiation_dict": {"product": "Taurus Elements Shell"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13729,10 +13592,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "350", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"in_stock": false} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/350/type/configurable/store/0/set/10/back/edit", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, + "response_status": 302 + } } ], "revision": 2 @@ -13746,7 +13614,6 @@ "intent_template": "Make all {{product}} as out of stock", "instantiation_dict": {"product": "Gobi HeatTec Tee"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13754,10 +13621,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "446", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"in_stock": false} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/446/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, + "response_status": 302 + } } ], "revision": 2 @@ -13771,7 +13643,6 @@ "intent_template": "Make all {{product}} as out of stock", "instantiation_dict": {"product": 
"rocco gym tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13779,10 +13650,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "682", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"in_stock": false} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/682/type/configurable/store/0/set/10/back/edit", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, + "response_status": 302 + } } ], "revision": 2 @@ -13796,7 +13672,6 @@ "intent_template": "Make all {{product}} as out of stock", "instantiation_dict": {"product": "Selene yoga hoodie"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13804,10 +13679,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1108", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"in_stock": false} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/10/back/edit", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, + "response_status": 302 + } } ], "revision": 2 @@ -13821,7 +13701,6 @@ "intent_template": "Make all {{product}} as out of stock", "instantiation_dict": {"product": "Aeon capri"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13829,10 +13708,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1861", + "evaluator": 
"NetworkEventEvaluator", "site": "shopping_admin", - "expected": {"in_stock": false} + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1861/type/configurable/store/0/set/10/back/edit", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, + "response_status": 302 + } } ], "revision": 2 @@ -13842,11 +13726,10 @@ "task_id": 506, "intent_template_id": 172, "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200. Discard any items in your cart if it is not empty.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "meat substitute", "dollar_value": "between 100 and 200"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13854,9 +13737,44 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "decode_base64_query": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/22490$", + "http_method": "POST", + "post_data": {"product": "22490"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "ignored_query_params_patterns": ["\\_"], + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", + "response_content": { + "items_qty": 1, + "$.items[0].name": "Beyond Meat Beef Beefy Crumble, 5 Pound -- 2 per case." + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", + "http_method": "POST", + "post_data": {"$.billingAddress.customerId": "27"} + } + }, + { + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B01CTR3DLE"} + "last_event_only": true, + "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], "revision": 2 @@ -13866,11 +13784,10 @@ "task_id": 507, "intent_template_id": 172, "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000. 
Discard any items in your cart if it is not empty.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "Ceiling light", "dollar_value": "above 1000"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13878,9 +13795,44 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "decode_base64_query": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/71506$", + "http_method": "POST", + "post_data": {"product": "71506"} + } + }, + { + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B07BVL3P1V"} + "last_event_only": true, + "ignored_query_params_patterns": ["\\_"], + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", + "response_content": { + "items_qty": 1, + "$.items[0].name": "40''X138\" Hight Ceiling Chandelier Crystal Raindrop Chandeliers Staircase Large Chandelier Villa Entrance Foyer Pendant Light Grand Light Foyer High Ceiling Fixture 15 GU10 Bulb Include Remote Dimming" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", + "http_method": "POST", + "post_data": {"$.billingAddress.customerId": "27"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], "revision": 2 @@ -13890,11 +13842,10 @@ "task_id": 508, "intent_template_id": 172, "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the NS switch 
pouch category within a budget under 60.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", - "instantiation_dict": {"product_category": "NS switch pouch", "dollar_value": "under 60"}, + "intent": "Buy the highest rated product from the NS switch category within a budget under 60. Discard any items in your cart if it is not empty.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", + "instantiation_dict": {"product_category": "NS switch", "dollar_value": "under 60"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13902,9 +13853,44 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "decode_base64_query": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/40448$", + "http_method": "POST", + "post_data": {"product": "40448"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "ignored_query_params_patterns": ["\\_"], + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", + "response_content": { + "items_qty": 1, + "$.items[0].name": "Multi Pouch The Legend of Zelda Breath of the Wild for Nintendo Switch" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", + "http_method": "POST", + "post_data": {"$.billingAddress.customerId": "27"} + } + }, + { + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B07116LGP6"} + "last_event_only": true, + "expected": {"url": 
"__SHOPPING__/checkout/onepage/success"} } ], "revision": 2 @@ -13914,11 +13900,10 @@ "task_id": 509, "intent_template_id": 216, "start_urls": ["__SHOPPING__"], - "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive", - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "intent": "Buy the best rating product from \"Men\"s shoe\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", "instantiation_dict": {"category": "Men's shoe"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13926,9 +13911,41 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "decode_base64_query": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/65885$", + "http_method": "POST", + "post_data": {"product": "65885"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "ignored_query_params_patterns": ["\\_"], + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", + "response_content": {"items_qty": 1, "$.items[0].name": "Clarks Men's Tunsil Lane Oxford"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", + "http_method": "POST", + "post_data": {"$.billingAddress.customerId": "27"} + } + }, + { + "evaluator": "NetworkEventEvaluator", "site": 
"shopping", - "expected": {"sku": "B01J4MM3KO"} + "last_event_only": true, + "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], "revision": 2 @@ -13938,11 +13955,10 @@ "task_id": 510, "intent_template_id": 216, "start_urls": ["__SHOPPING__"], - "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive", - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive", + "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", + "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", "instantiation_dict": {"category": "Home Audio Speaker"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13966,7 +13982,6 @@ "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "laundry detergent"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -13995,7 +14010,6 @@ "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "toothpaste"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14151,7 +14165,6 @@ "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "chair"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14527,7 +14540,6 @@ "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "white desk"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", @@ -14593,7 +14605,6 @@ "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "white computer desk"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14636,11 +14647,10 @@ "start_urls": [ "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html" ], - "intent": "Add this product to my wishlist", - "intent_template": "Add this product to my wishlist", + "intent": "Add the product on the current page to my wishlist", + "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14662,11 +14672,10 @@ "start_urls": [ "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html" ], - "intent": "Add this product to my wishlist", - "intent_template": "Add this product to my wishlist", + "intent": "Add the product on the current page to my wishlist", + "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14686,11 +14695,10 @@ "task_id": 518, "intent_template_id": 196, "start_urls": ["__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html"], - "intent": "Add this product to my wishlist", - "intent_template": "Add this product to my wishlist", + "intent": "Add the product on the current page to my wishlist", + "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the 
context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14712,11 +14720,10 @@ "start_urls": [ "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html" ], - "intent": "Add this product to my wishlist", - "intent_template": "Add this product to my wishlist", + "intent": "Add the product on the current page to my wishlist", + "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14724,9 +14731,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B09GG4P4MD"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "37339", "qty": "1"}, + "response_status": 302 + } } ], "revision": 2 @@ -14738,11 +14751,10 @@ "start_urls": [ "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html" ], - "intent": "Add this product to my wishlist", - "intent_template": "Add this product to my wishlist", + "intent": "Add the product on the current page to my wishlist", + "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14750,9 +14762,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": 
"BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B073XDR3K6"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "37313", "qty": "1"}, + "response_status": 302 + } } ], "revision": 2 @@ -14766,14 +14784,23 @@ "intent_template": "Subscribe to the newsletter of OneStopMarket", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, - {"evaluator": "BackendStateEvaluator", "site": "shopping", "expected": true} + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/newsletter/subscriber/new/", + "http_method": "POST", + "post_data": {"email": "emma.lopez@gmail.com"}, + "response_status": 302 + } + } ], "revision": 2 }, @@ -14786,7 +14813,6 @@ "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "all repos from facebook"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14794,10 +14820,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": ["create-react-app", "buck"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/122/fork", + "http_method": "POST", + "post_data": { + "id": "122", + "name": "create-react-app", + "namespace_id": 2505, + "path": "create-react-app" + }, + "response_status": 201 + } } ], "revision": 2 @@ -14811,7 +14845,6 @@ "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "five"}, "format_specification": null, 
- "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14819,14 +14852,49 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter", - "AndroidAsync" - ], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 56} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 53} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 52} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 48} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 47} + } } ], "revision": 2 @@ -14840,7 +14908,6 @@ "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "eight"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14848,17 +14915,79 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": 
"byteblaze", - "project_names": [ - "AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter", - "AndroidAsync", "Pytorch-GAN", "administrate", "keycloak" - ], - "site": "gitlab", - "expected": true - } - ], - "revision": 2 + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 56} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 53} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 52} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 48} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 47} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/eriklindernoren/PyTorch-GAN/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 46} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/thoughtbot/administrate/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 45} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/keycloak/keycloak/toggle_star.json", + "http_method": "POST", + "response_status": 200, + 
"response_content": {"star_count": 44} + } + } + ], + "revision": 2 }, { "sites": ["gitlab"], @@ -14869,7 +14998,6 @@ "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "four"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14877,11 +15005,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "project_names": ["AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python", "PHP_XLSXWriter"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 56} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 53} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 52} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 48} + } } ], "revision": 2 @@ -14895,7 +15052,6 @@ "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "three"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14903,11 +15059,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - 
"user_id": "byteblaze", - "project_names": ["AndroidSlidingUpPanel", "create-react-app", "ffmpeg-python"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 56} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 53} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 52} + } } ], "revision": 2 @@ -14921,7 +15097,6 @@ "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "one"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -14929,11 +15104,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "user_id": "byteblaze", - "project_names": ["AndroidSlidingUpPanel"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", + "http_method": "POST", + "response_status": 200, + "response_content": {"star_count": 56} + } } ], "revision": 2 @@ -14943,27 +15120,34 @@ "task_id": 528, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. 
Don't submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", "instantiation_dict": {"product": "phone screen protector", "time": "March 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*180[.\\s]+Amount\\s*:\\s*(\\$\\s*)?12\\.99(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" + }, + 
"response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -14973,27 +15157,34 @@ "task_id": 529, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "instantiation_dict": {"product": "bluetooth speaker", "time": "Feb 2023"}, + "intent": "Draft a refund message via their \"contact us\" form for the Stereo Sound System I bought Feb 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". 
Don\"t submit yet", + "instantiation_dict": {"product": "Stereo Sound System", "time": "Feb 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*157[.\\s]+Amount\\s*:\\s*(\\$\\s*)?122\\.05(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -15003,27 +15194,34 @@ "task_id": 530, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. 
The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", "instantiation_dict": {"product": "kitchen organizer", "time": "around Feb 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*161[.\\s]+Amount\\s*:\\s*(\\$\\s*)?68\\.88(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -15033,27 +15231,21 @@ "task_id": 531, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. 
State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\".
Don't submit yet", "instantiation_dict": {"product": "phone case", "time": "March 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -15063,27 +15255,21 @@ "task_id": 532, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. State: \"It broke after three days of use\". The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet", + "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don't submit yet", + "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: .
Reason: It broke after three days of use.\". Don't submit yet", "instantiation_dict": {"product": "PS3 remote controller", "time": "early 2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -15093,11 +15279,10 @@ "task_id": 533, "intent_template_id": 330, "start_urls": ["__GITLAB__"], - "intent": "Follow ['convexegg', 'yjlou'] on Gitlab", + "intent": "Follow [\"convexegg\", \"yjlou\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { "account_list": ["convexegg", "yjlou"] }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15119,11 +15304,10 @@ "task_id": 534, "intent_template_id": 330, "start_urls": ["__GITLAB__"], - "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab", + "intent": "Follow [\"Jakub Klinkovsk\u00fd\", \"Koushik\", \"Vinta Chen\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { "account_list": ["Jakub Klinkovsk\u00fd", "Koushik", "Vinta Chen"] }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15145,11 +15329,10 @@ "task_id": 535, "intent_template_id": 330, "start_urls": ["__GITLAB__"], - "intent": "Follow ['Jakub K', 'ghost', 'Beno\u00eet Blanchon'] on Gitlab", + "intent": "Follow [\"Jakub K\", \"ghost\",
\"Beno\u00eet Blanchon\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { "account_list": ["Jakub K", "ghost", "Beno\u00eet Blanchon"] }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15171,11 +15354,10 @@ "task_id": 536, "intent_template_id": 330, "start_urls": ["__GITLAB__"], - "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab", + "intent": "Follow [\"ghost\", \"R1kk3r\", \"Abishek\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { "account_list": ["ghost", "R1kk3r", "Abishek"] }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15197,13 +15379,12 @@ "task_id": 537, "intent_template_id": 330, "start_urls": ["__GITLAB__"], - "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab", + "intent": "Follow [\"Jakub Klinkovsk\", \"convexegg\", \"Vinta Chen\", \"yjlou\", \"Abishek S\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { "account_list": ["Jakub Klinkovsk", "convexegg", "Vinta Chen", "yjlou", "Abishek S"] }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15232,7 +15413,6 @@ "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15264,7 +15444,6 @@ "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "65", "address": "789 Pine Lane, San Francisco, CA, 94102"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15297,7 +15476,6 @@ "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201" }, "format_specification": null, - 
"start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15328,7 +15506,6 @@ "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "125", "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15336,15 +15513,24 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "125", - "site": "shopping_admin", - "expected": { - "address": "654 Elm Drive", - "address2": "Apartment 12", - "city": "Miami", - "state": "Florida", - "zip_code": "33101" + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/249/", + "http_method": "POST", + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/address/address_id/249/"}, + "method": "POST", + "post_data": { + "street[0]": "654 Elm Drive", + "street[1]": "Apartment 12", + "country_id": "US", + "region": "Florida", + "region_id": "18", + "city": "Miami", + "postcode": "33101" + }, + "response_status": 302, + "redirect_url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125/" } } ], @@ -15359,7 +15545,6 @@ "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "300", "address": "987 Cedar Court, Los Angeles, CA, 90012"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15385,11 +15570,10 @@ "task_id": 543, "intent_template_id": 251, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Bella Tank with the titles of all reviews with 4 stars or above.", - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or 
above.", + "intent": "Update the product description of Bella Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", + "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Bella Tank"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15397,11 +15581,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1684", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", + "last_event_only": true, "expected": { - "description": ["A sweet n sporty look for the gym", "Good choice for working out"] + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1684/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": {"product[short_description]": "

2 customer(s) love it!

"} } } ], @@ -15412,11 +15599,10 @@ "task_id": 544, "intent_template_id": 251, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Selene Yoga Hoodie with the titles of all reviews with 4 stars or above.", - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "intent": "Update the product description of Selene Yoga Hoodie to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", + "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Selene Yoga Hoodie"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15424,10 +15610,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1108", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { "description": ["Best hoodies I've owned.", "Great value", "Kept me warm"] } + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": {"product[short_description]": "

3 customer(s) love it!

"} + } } ], "revision": 2 @@ -15437,11 +15628,10 @@ "task_id": 545, "intent_template_id": 251, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Radiant Tee with the titles of all reviews with 4 stars or above.", - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "intent": "Update the product description of Radiant Tee to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", + "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Radiant Tee"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15449,10 +15639,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1556", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { "description": ["What a versatile shirt!"] } + "last_event_only": true, + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1556/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": {"product[short_description]": "

1 customer(s) love it!

"} + } } ], "revision": 2 @@ -15462,19 +15657,25 @@ "task_id": 546, "intent_template_id": 251, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Lucia Cross-Fit Bra with the titles of all reviews with 4 stars or above.", - "intent_template": "Update the product description of {{product}} with the titles of all reviews with 4 stars or above.", + "intent": "Update the product description of Lucia Cross-Fit Bra to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", + "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Lucia Cross-Fit Bra"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping_admin", + "last_event_only": true, "expected": { - "performed_operation": "retrieve", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1668/type/configurable/store/0/set/9/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": {"product[short_description]": "

don't miss out on this amazing product

"} } } ], @@ -15494,7 +15695,6 @@ "product": "Phoebe Zipper Sweatshirt" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15524,7 +15724,6 @@ "product": "Frankie Sweatshirt" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15554,7 +15753,6 @@ "product": "Minerva LumaTech V-Tee" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15584,7 +15782,6 @@ "product": "Nona Fitness Tank" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15614,7 +15811,6 @@ "product": "Diana Tights" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15622,14 +15818,33 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1854", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "variants": [ - "Diana Tights-30-Blue", "Diana Tights-30-Black", "Diana Tights-30-Orange", - "Diana Tights-31-Blue", "Diana Tights-31-Black", "Diana Tights-31-Orange" - ] + "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1854/type/configurable/store/0/set/10/back/edit", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[configurable_attributes_data][93][code]": "color", + "product[configurable_attributes_data][93][position]": "1", + "product[configurable_attributes_data][93][values][49][include]": "1", + "product[configurable_attributes_data][93][values][49][value_index]": "49", + "product[configurable_attributes_data][93][values][50][include]": "1", + "product[configurable_attributes_data][93][values][50][value_index]": "50", + "product[configurable_attributes_data][93][values][56][include]": "1", + 
"product[configurable_attributes_data][93][values][56][value_index]": "56", + "product[configurable_attributes_data][144][attribute_id]": "144", + "product[configurable_attributes_data][144][code]": "size", + "product[configurable_attributes_data][144][position]": "0", + "product[configurable_attributes_data][144][values][171][include]": "1", + "product[configurable_attributes_data][144][values][171][value_index]": "171", + "product[configurable_attributes_data][144][values][172][include]": "1", + "product[configurable_attributes_data][144][values][172][value_index]": "172", + "product[configurable_attributes_data][144][values][173][include]": "1", + "product[configurable_attributes_data][144][values][173][value_index]": "173", + "product[configurable_attributes_data][144][values][174][include]": "1", + "product[configurable_attributes_data][144][values][174][value_index]": "174" + } } } ], @@ -15640,15 +15855,10 @@ "task_id": 552, "intent_template_id": 84, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space subreddit.", - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", - "instantiation_dict": { - "directory": "real_space", - "subreddit": "space", - "gitlab_repo": "gimmiethat.space" - }, + "intent": "Use the Web IDE to create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the space forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. 
Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "instantiation_dict": {"directory": "real_space", "gitlab_repo": "gimmiethat.space", "forum": "space"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15656,20 +15866,29 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "real_space/urls.txt", - "substring": [ - "/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus", - "/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations", - "/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon", - "/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black", - "/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/space/new"} + }, + { + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "create", + "$.actions[0].file_path": "real_space/urls.json", + "$.actions[0].content": "{\"urls\": 
[\"__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus\",\"__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations\",\"__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon\",\"__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black\",\"__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s\"]}", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15679,15 +15898,14 @@ "task_id": 553, "intent_template_id": 84, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a folder named news in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits subreddit.", - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", + "intent": "Use the Web IDE to create a folder named news in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the news related forums forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", "instantiation_dict": { "directory": "news", "gitlab_repo": "gimmiethat.space", - "subreddit": "news related subreddits" + "forum": "news related subreddits" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15695,20 +15913,29 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "news/urls.txt", - "substring": [ - "/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack", - "/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation", - "/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law", - "/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under", - "/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/news/new"} + }, + { + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "create", + "$.actions[0].file_path": "news/urls.json", + "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack\", \"__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation\", \"__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law\", 
\"__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under\", \"__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and\"]}", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15718,15 +15945,14 @@ "task_id": 554, "intent_template_id": 84, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a folder named moive_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies subreddit.", - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", + "intent": "Use the Web IDE to create a folder named movie_space in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the movies forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", "instantiation_dict": { - "directory": "moive_space", + "directory": "movie_space", "gitlab_repo": "gimmiethat.space", - "subreddit": "movies" + "forum": "movies" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15734,20 +15960,29 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "movie_space/urls.txt", - "substring": [ - "/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make", - "/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in", - "/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of", - "/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for", - "/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/movies/new"} + }, + { + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "create", + "$.actions[0].file_path": "movie_space/urls.json", + "$.actions[0].content": "{\"urls\": 
[\"__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make\",\"__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in\",\"__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of\",\"__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for\",\"__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia\"]}", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15757,15 +15992,10 @@ "task_id": 555, "intent_template_id": 84, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a folder named funny_pic in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes subreddit.", - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", - "instantiation_dict": { - "directory": "funny_pic", - "gitlab_repo": "gimmiethat.space", - "subreddit": "memes" - }, + "intent": "Use the Web IDE to create a folder named funny_pic in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the memes forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", + "instantiation_dict": {"directory": "funny_pic", "gitlab_repo": "gimmiethat.space", "forum": "memes"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15773,19 +16003,29 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "funny_pic/urls.txt", - "substring": [ - "/f/memes/127991/it-do-be-like-that-tho", - "/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you", - "/f/memes/127989/if-you-have-no-other-choice", "/f/memes/127988/yes-yes-yes", - "/f/memes/127987/shagadelic-baby" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/memes/new"} + }, + { + "evaluator": "NetworkEventEvaluator", + "gen_missing_post_data_schema": true, + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "create", + "$.actions[0].file_path": "funny_pic/urls.json", + "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/memes/127991/it-do-be-like-that-tho\",\"__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you\",\"__REDDIT__/f/memes/127989/if-you-have-no-other-choice\",\"__REDDIT__/f/memes/127988/yes-yes-yes\",\"__REDDIT__/f/memes/127987/shagadelic-baby\"]}", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15795,11 +16035,13 @@ "task_id": 556, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repo named 
nolan_honest_fans with movies directed by Christopher Nolan in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", - "instantiation_dict": {"name": "nolan_honest_fans", "topics": "movies directed by Christopher Nolan"}, + "intent": "Create a repository named nolan_honest_fans with a README file containing only Christopher Nolan's theatrically released feature-length films (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "instantiation_dict": { + "name": "nolan_honest_fans", + "topics": "only Christopher Nolan's theatrically released feature-length films" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15807,18 +16049,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "nolan_honest_fans", - "file_path": "README.md", - "substring": [ - "Following", "Memento", "Insomnia", "Batman Begins", "The Prestige", - "The Dark Knight", "Inception", "The Dark Knight Rises", "Interstellar", - "Dunkirk", "Tenet", "Oppenheimer" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + 
"project[name]": "nolan_honest_fans", + "project[path]": "nolan_honest_fans", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_honest_fans/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# nolan_honest_fans\n\n- Following\n- Memento\n- Insomnia\n- Batman Begins\n- The Prestige\n- The Dark Knight\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet\n- Oppenheimer", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15828,14 +16092,13 @@ "task_id": 557, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "intent": "Create a repository named nolan_old_fans with a README file containing only Christopher Nolan's theatrically released feature-length films before 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", "instantiation_dict": { "name": "nolan_old_fans", - "topics": "movies directed by Christopher Nolan before 2010" + "topics": "only Christopher Nolan's theatrically released feature-length films before 2010" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15843,17 +16106,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "nolan_old_fans", - "file_path": "README.md", - "substring": [ - "Following", "Memento", "Insomnia", "Batman Begins", "The Prestige", - "The Dark Knight" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "nolan_old_fans", + "project[path]": "nolan_old_fans", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_old_fans/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# nolan_old_fans\n\n- Following\n- Memento\n- Insomnia\n- Batman Begins\n- The Prestige\n- The Dark Knight", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15863,14 +16149,13 @@ "task_id": 558, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repo named nolan_young_fans with movies directed by 
Christopher Nolan after 2010 in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "intent": "Create a repository named nolan_young_fans with a README file containing only Christopher Nolan's theatrically released feature-length films after 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", "instantiation_dict": { "name": "nolan_young_fans", - "topics": "movies directed by Christopher Nolan after 2010" + "topics": "only Christopher Nolan's theatrically released feature-length films after 2010" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15878,17 +16163,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "nolan_young_fans", - "file_path": "README.md", - "substring": [ - "Inception", "The Dark Knight Rises", "Interstellar", "Dunkirk", "Tenet", - "Oppenheimer" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "nolan_young_fans", + "project[path]": "nolan_young_fans", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + 
"evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_young_fans/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# nolan_young_fans\n\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet\n- Oppenheimer", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15898,11 +16206,13 @@ "task_id": 559, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repo named nolan_followers with career timeline of Christopher Nolan in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", - "instantiation_dict": {"name": "nolan_followers", "topics": "career timeline of Christopher Nolan"}, + "intent": "Create a repository named nolan_followers with a README file containing career timeline headings of Christopher Nolan in order (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", + "instantiation_dict": { + "name": "nolan_followers", + "topics": "career timeline headings of Christopher Nolan in order" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15910,18 +16220,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "nolan_followers", - "file_path": "README.md", - "substring": [ - "1993\u20132003: Early career and breakthrough", - "2003\u20132013: Widespread recognition", - "2014\u20132019: Established Hollywood auteur", "2020\u2013present" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "nolan_followers", + "project[path]": "nolan_followers", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_followers/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# nolan_followers\n\n- 1993\u20132003: Early career and breakthrough\n- 2003\u20132013: Widespread recognition\n- 2014\u20132019: Established Hollywood auteur\n- 2020\u2013present", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15931,14 +16263,13 @@ "task_id": 560, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - 
"intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "intent": "Create a repository named nolan_academy_awards with a README file containing movies that won Academy Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", "instantiation_dict": { "name": "nolan_academy_awards", "topics": "movies that won Academy Awards by Christopher Nolan" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15946,14 +16277,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "nolan_academy_awards", - "file_path": "README.md", - "substring": ["The Dark Knight", "Inception", "Interstellar", "Dunkirk", "Tenet"], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "nolan_academy_awards", + "project[path]": "nolan_academy_awards", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + 
"post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_academy_awards/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# nolan_academy_awards\n\n- The Dark Knight\n- Inception\n- Interstellar\n- Dunkirk\n- Tenet", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15963,14 +16320,13 @@ "task_id": 561, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file", - "intent_template": "Create a repo named {{name}} with {{topics}} in a README file", + "intent": "Create a repository named bafta_awards_nolan with a README file containing movies that are nominated BAFTA Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", "instantiation_dict": { "name": "bafta_awards_nolan", "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -15978,17 +16334,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "bafta_awards_nolan", - "file_path": "README.md", - "substring": [ - "Batman Begins", "The Dark Knight", "Inception", "The Dark Knight Rises", - "Interstellar", "Dunkirk", "Tenet" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "bafta_awards_nolan", + "project[path]": "bafta_awards_nolan", + "project[namespace_id]": "2505" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Fbafta_awards_nolan/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# bafta_awards_nolan\n\n- Batman Begins\n- The Dark Knight\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -15998,11 +16377,10 @@ "task_id": 562, "intent_template_id": 88, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a repository named Awesome_DIY_ideas that includes a README file with the 
links to the most active 6 DIY ideas on DIY subreddit.", - "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "intent": "Create a private repository named Awesome_DIY_ideas with a README file containing links to the 6 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", + "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "Awesome_DIY_ideas", "num": 6}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16010,21 +16388,43 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "Awesome_DIY_ideas", - "file_path": "README.md", - "substring": [ - "/f/DIY/118903/separate-glued-plastic-parts", - "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "/f/DIY/118931/afci-outlet-question" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/diy/active"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + "project[name]": "Awesome_DIY_ideas", + "project[path]": "awesome_diy_ideas", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2FAwesome_DIY_ideas/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# Awesome_DIY_ideas\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this 
dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -16034,11 +16434,10 @@ "task_id": 563, "intent_template_id": 88, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit.", - "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "intent": "Create a private repository named fun_thing_to_do with a README file containing links to the 5 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", + "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "fun_thing_to_do", "num": 5}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16046,20 +16445,43 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "fun_thing_to_do", - "file_path": "README.md", - "substring": [ - "/f/DIY/118903/separate-glued-plastic-parts", - "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/diy/active"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + "project[name]": "fun_thing_to_do", + "project[path]": "fun_thing_to_do", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Ffun_thing_to_do/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# fun_thing_to_do\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent 
mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -16069,11 +16491,10 @@ "task_id": 564, "intent_template_id": 88, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit.", - "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "intent": "Create a private repository named live_a_life with a README file containing links to the 3 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", + "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "live_a_life", "num": 3}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16081,18 +16502,43 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "live_a_life", - "file_path": "README.md", - "substring": [ - "/f/DIY/118903/separate-glued-plastic-parts", - "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "/f/DIY/118935/basement-bulkhead-soffit-wall-framing" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/diy/active"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + "project[name]": "live_a_life", + "project[path]": "live_a_life", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2Flive_a_life/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# live_a_life\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)", + "$.actions[1].action": 
null + }, + "response_status": 201 + } } ], "revision": 2 @@ -16102,11 +16548,10 @@ "task_id": 565, "intent_template_id": 88, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit.", - "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "intent": "Create a private repository named TODO with a README file containing links to the 10 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", + "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "TODO", "num": 10}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16114,25 +16559,43 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "TODO", - "file_path": "README.md", - "substring": [ - "/f/DIY/118903/separate-glued-plastic-parts", - "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "/f/DIY/118931/afci-outlet-question", - "/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", - "/f/DIY/118866/paver-base-for-shed", - "/f/DIY/118820/ways-to-locate-our-buried-electrical-service", - "/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/diy/active"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + "project[name]": "TODO", + "project[path]": "todo", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/byteblaze%2FTODO/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + 
"$.actions[0].content": "# TODO\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)\n- [Teflon tape to attach washing machine drain hose to pipe?](__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe)\n- [Paver base for shed?](__REDDIT__/f/DIY/118866/paver-base-for-shed)\n- [Ways to locate our buried electrical service?](__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service)\n- [How to eliminate transitions for disability/mobility reasons](__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons)", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -16142,11 +16605,10 @@ "task_id": 566, "intent_template_id": 88, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit.", - "intent_template": "Create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit.", + "intent": "Create a private repository named Do it myself with a README file containing links to the 8 most active DIY posts from the DIY forum. Use the Web IDE to create the README. 
Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", + "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "instantiation_dict": {"name": "Do it myself", "num": 8}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16154,23 +16616,43 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "Do-it-myself", - "file_path": "README.md", - "substring": [ - "/f/DIY/118903/separate-glued-plastic-parts", - "/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess", - "/f/DIY/118935/basement-bulkhead-soffit-wall-framing", - "/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit", - "/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches", - "/f/DIY/118931/afci-outlet-question", - "/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe", - "/f/DIY/118866/paver-base-for-shed" - ], - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/f/diy/active"} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + 
"project[name]": "Do it myself", + "project[path]": "do-it-myself", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "__GITLAB__//api/v4/projects/byteblaze%2Fdo-it-myself-with/repository/commits", + "http_method": "POST", + "post_data": { + "branch": "main", + "$.actions[0].action": "^(update|create)$", + "$.actions[0].file_path": "README.md", + "$.actions[0].content": "# Do it myself\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. 
Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)\n- [Teflon tape to attach washing machine drain hose to pipe?](__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe)\n- [Paver base for shed?](__REDDIT__/f/DIY/118866/paver-base-for-shed)", + "$.actions[1].action": null + }, + "response_status": 201 + } } ], "revision": 2 @@ -16187,7 +16669,6 @@ "repo": "gimmiethat.space" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16195,12 +16676,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["lahwaacz", "bblanchon"], - "group": "byteblaze", - "project": "gimmiethat.space", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/184/members", + "http_method": "POST", + "post_data": {"user_id": 1842, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/184/members", + "http_method": "POST", + "post_data": {"user_id": 597, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -16214,7 +16705,6 @@ "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": {"collaborator_account_list": "Abishek and Vinta", "repo": "a11yproject.com"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16222,12 +16712,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["abisubramanya27", "vinta"], - "group": "a11yproject", - "project": 
"a11yproject.com", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/174/members", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/174/members", + "http_method": "POST", + "post_data": {"user_id": 278, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -16244,7 +16744,6 @@ "repo": "my HTML5 markup extention" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16252,12 +16751,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["bblanchon", "abisubramanya27"], - "group": "byteblaze", - "project": "accessible-html-content-patterns", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/185/members", + "http_method": "POST", + "post_data": {"user_id": 597, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/185/members", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -16274,7 +16783,6 @@ "repo": "my time tracking tool project" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16282,41 +16790,76 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["lahwaacz", "V13Axel", "alexhutnik", "bblanchon"], - "group": "byteblaze", - "project": "timeit", - "site": "gitlab", - "expected": true - } - ], - "revision": 2 - }, - { - 
"sites": ["shopping"], - "task_id": 571, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "231 Willow Way, Suite 100, Chicago, IL, 60601"}, - "format_specification": null, - "start_url_context": null, - "eval": [ - { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/190/members", + "http_method": "POST", + "post_data": {"user_id": 1842, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/190/members", + "http_method": "POST", + "post_data": {"user_id": 2179, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/190/members", + "http_method": "POST", + "post_data": {"user_id": 1693, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/190/members", + "http_method": "POST", + "post_data": {"user_id": 597, "access_level": 30}, + "response_status": 201 + } + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 571, + "intent_template_id": 165, + "start_urls": ["__SHOPPING__"], + "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", + "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", + "instantiation_dict": {"address": "231 Willow Way, Suite 100, Chicago, IL, 60601"}, + "format_specification": null, + "eval": [ + { "evaluator": 
"AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", + "last_event_only": true, "expected": { - "address": "231 Willow Way", - "address2": "Suite 100", - "city": "Chicago", - "state": "Illinois", - "zip_code": "60601" + "url": "__SHOPPING__/customer/address/formPost/id/26/", + "http_method": "POST", + "post_data": { + "firstname": "Emma", + "lastname": "Lopez", + "street[0]": "231 Willow Way", + "street[1]": "Suite 100", + "country_id": "US", + "city": "Chicago", + "postcode": "60601" + }, + "response_status": 302 } } ], @@ -16331,7 +16874,6 @@ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "654 Aspen Road, House #3, Boston, MA, 02110"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16339,14 +16881,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", + "last_event_only": true, "expected": { - "address": "654 Aspen Road", - "address2": "House #3", - "city": "Boston", - "state": "Massachusetts", - "zip_code": "02110" + "url": "__SHOPPING__/customer/address/formPost/id/26/", + "http_method": "POST", + "post_data": { + "firstname": "Emma", + "lastname": "Lopez", + "street[0]": "654 Aspen Road", + "street[1]": "House #3", + "country_id": "US", + "city": "Boston", + "postcode": "02110" + }, + "response_status": 302 } } ], @@ -16361,7 +16911,6 @@ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "987 Sycamore Circle, Philadelphia, PA, 19102"}, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16369,13 +16918,21 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", + "last_event_only": true, "expected": { - "address": "987 Sycamore Circle", - "city": "Philadelphia", - "state": "Pennsylvania", - "zip_code": "19102" + "url": "__SHOPPING__/customer/address/formPost/id/26/", + "http_method": "POST", + "post_data": { + "firstname": "Emma", + "lastname": "Lopez", + "street[0]": "987 Sycamore Circle", + "country_id": "US", + "city": "Philadelphia", + "postcode": "19102" + }, + "response_status": 302 } } ], @@ -16390,7 +16947,6 @@ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "111 Magnolia Path, Atlanta, GA, 30303"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16398,13 +16954,21 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", + "last_event_only": true, "expected": { - "address": "111 Magnolia Path", - "city": "Atlanta", - "state": "Georgia", - "zip_code": "30303" + "url": "__SHOPPING__/customer/address/formPost/id/26/", + "http_method": "POST", + "post_data": { + "firstname": "Emma", + "lastname": "Lopez", + "street[0]": "111 Magnolia Path", + "country_id": "US", + "city": "Atlanta", + "postcode": "30303" + }, + "response_status": 302 } } ], @@ -16419,7 +16983,6 @@ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101"}, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16427,14 +16990,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", + "last_event_only": true, "expected": { - "address": "222 Redwood Rise", - "address2": "Suite 300", - "city": "Seattle", - "state": "Washington", - "zip_code": "98101" + "url": "__SHOPPING__/customer/address/formPost/id/26/", + "http_method": "POST", + "post_data": { + "firstname": "Emma", + "lastname": "Lopez", + "street[0]": "222 Redwood Rise", + "street[1]": "Suite 300", + "country_id": "US", + "city": "Seattle", + "postcode": "98101" + }, + "response_status": 302 } } ], @@ -16445,15 +17016,14 @@ "task_id": 576, "intent_template_id": 351, "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']", + "intent": "Add the following users to repo a11y-webring.club as developer: \"abisubramanya27\" and \"lahwaacz\"", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "a11y-webring.club", "role": "developer", - "user_list": ["abisubramanya27", "lahwaacz"] + "user_list": "\"abisubramanya27\" and \"lahwaacz\"" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16461,13 +17031,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["abisubramanya27", "lahwaacz"], - "group": "byteblaze", - "project": "a11y-webring.club", - "access_level": 30, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": 
"string_list"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/179/invitations", + "http_method": "POST", + "post_data": {"user_id": "5,1842", "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -16477,15 +17051,14 @@ "task_id": 577, "intent_template_id": 351, "start_urls": ["__GITLAB__"], - "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']", + "intent": "Add the following users to my GitHub timeline item management extension as maintainer: \"abisubramanya27\" and \"lahwaacz\"", "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "GitHub timeline item management extension", "role": "maintainer", - "user_list": ["abisubramanya27", "lahwaacz"] + "user_list": "\"abisubramanya27\" and \"lahwaacz\"" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16493,13 +17066,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["abisubramanya27", "lahwaacz"], - "group": "byteblaze", - "project": "remove-board-movement-events-from-the-github-issue-timeline", - "access_level": 40, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "__GITLAB__/api/v4/projects/181/invitations", + "http_method": "POST", + "post_data": {"user_id": "5,1842", "access_level": 40}, + "response_status": 201 + } } ], "revision": 2 @@ -16509,15 +17086,14 @@ "task_id": 578, "intent_template_id": 351, "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']", + "intent": "Add the following users to repo 
millennials-to-snake-people as reporter: \"yjlou\" and \"a11yproject\"", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "millennials-to-snake-people", "role": "reporter", - "user_list": ["yjlou", "a11yproject"] + "user_list": "\"yjlou\" and \"a11yproject\"" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16525,13 +17101,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["yjlou", "a11yproject"], - "group": "byteblaze", - "project": "millennials-to-snake-people", - "access_level": 20, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/187/invitations", + "http_method": "POST", + "post_data": {"user_id": "168,2325", "access_level": 20}, + "response_status": 201 + } } ], "revision": 2 @@ -16541,11 +17117,10 @@ "task_id": 579, "intent_template_id": 351, "start_urls": ["__GITLAB__"], - "intent": "Add the following users to my time tracking tool as guest: ['yjlou']", + "intent": "Add the following users to my time tracking tool as guest: \"yjlou\"", "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { "repo": "time tracking tool", "role": "guest", "user_list": ["yjlou"] }, + "instantiation_dict": {"repo": "time tracking tool", "role": "guest", "user_list": "\"yjlou\""}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16553,13 +17128,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "collaborators": ["yjlou"], - "group": "byteblaze", - "project": "timeit", - "access_level": 10, - "site": "gitlab", - "expected": true + 
"evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/190/invitations", + "http_method": "POST", + "post_data": {"user_id": "168", "access_level": 10}, + "response_status": 201 + } } ], "revision": 2 @@ -16569,15 +17144,14 @@ "task_id": 580, "intent_template_id": 7, "start_urls": ["__REDDIT__"], - "intent": "Create a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?", - "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "intent": "Create a new forum with name and title \"sci_fi\", description \"A wild place for sci-fi enthusiasts\", and sidebar \"**New**, Classic, *Movies*, Post my novel, Random\"", + "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", "instantiation_dict": { "name": "sci_fi", "description": "A wild place for sci-fi enthusiasts", - "sidebar_list": ["New", "Classic", "Movies", "Post my novel", "Random"] + "sidebar_list": "**New**, Classic, *Movies*, Post my novel, Random" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16585,12 +17159,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "sci_fi", - "description": "A wild place for sci-fi enthusiasts", - "sidebar": ["New", "Classic", "Movies", "Post my novel", "Random"] + "url": "__REDDIT__/create_forum", + "http_method": "POST", + "response_status": 302, + "post_data": { + "forum[name]": "sci_fi", + "forum[title]": "sci_fi", + "forum[description]": "A wild place for sci-fi enthusiasts", + "forum[sidebar]": "**New**, Classic, 
*Movies*, Post my novel, Random" + } } } ], @@ -16601,15 +17180,14 @@ "task_id": 581, "intent_template_id": 7, "start_urls": ["__REDDIT__"], - "intent": "Create a new forum named cmu_lti, with a description of Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?", - "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "intent": "Create a new forum with name and title \"cmu_lti\", description \"Language Technologies Institute at Carnegie Mellon University\", and sidebar \"**announcement**, paper, **alumni**\"", + "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", "instantiation_dict": { "name": "cmu_lti", "description": "Language Technologies Institute at Carnegie Mellon University", - "sidebar_list": ["announcement", "paper", "alumni"] + "sidebar_list": "**announcement**, paper, **alumni**" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16617,12 +17195,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "cmu_lti", - "description": "Language Technologies Institute at Carnegie Mellon University", - "sidebar": ["announcement", "paper", "alumni"] + "url": "__REDDIT__/create_forum", + "http_method": "POST", + "response_status": 302, + "post_data": { + "forum[name]": "cmu_lti", + "forum[title]": "cmu_lti", + "forum[description]": "Language Technologies Institute at Carnegie Mellon University", + "forum[sidebar]": "**announcement**, paper, **alumni**" + } } } ], @@ -16633,15 +17216,14 @@ "task_id": 582, "intent_template_id": 7, "start_urls": ["__REDDIT__"], - "intent": "Create a 
new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?", - "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "intent": "Create a new forum with name and title \"Cyberpunk\", description \"Welcome to the future\", and sidebar \"Games, *Books*, Movies, *Future*\"", + "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", "instantiation_dict": { "name": "Cyberpunk", "description": "Welcome to the future", - "sidebar_list": ["Games", "Books", "Movies", "Future"] + "sidebar_list": "Games, *Books*, Movies, *Future*" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16649,12 +17231,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "Cyberpunk", - "description": "Welcome to the future", - "sidebar": ["Games", "Books", "Movies", "Future"] + "url": "__REDDIT__/create_forum", + "http_method": "POST", + "response_status": 302, + "post_data": { + "forum[name]": "Cyberpunk", + "forum[title]": "Cyberpunk", + "forum[description]": "Welcome to the future", + "forum[sidebar]": "Games, *Books*, Movies, *Future*" + } } } ], @@ -16665,15 +17252,14 @@ "task_id": 583, "intent_template_id": 7, "start_urls": ["__REDDIT__"], - "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plan lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?", - "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "intent": "Create a new 
forum with name and title \"PlantsForCatParents\", description \"Cat parents & plan lovers\", and sidebar \"**Cat friendly**, Local vendors, Promotion, *Toxic plants!*\"", + "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", "instantiation_dict": { "name": "PlantsForCatParents", "description": "Cat parents & plan lovers", - "sidebar_list": ["Cat friendly", "Local vendors", "Promotion", "Toxic plants!"] + "sidebar_list": "**Cat friendly**, Local vendors, Promotion, *Toxic plants!*" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16681,12 +17267,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "PlantsForCatParents", - "description": "Cat parents & plan lovers", - "sidebar": ["Cat friendly", "Local vendors", "Promotion", "Toxic plants!"] + "url": "__REDDIT__/create_forum", + "http_method": "POST", + "response_status": 302, + "post_data": { + "forum[name]": "PlantsForCatParents", + "forum[title]": "PlantsForCatParents", + "forum[description]": "Cat parents & plan lovers", + "forum[sidebar]": "**Cat friendly**, Local vendors, Promotion, *Toxic plants!*" + } } } ], @@ -16697,15 +17288,14 @@ "task_id": 584, "intent_template_id": 7, "start_urls": ["__REDDIT__"], - "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?", - "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?", + "intent": "Create a new forum with name and title \"Karaoke\", description \"Place for Karaoke lovers\", and sidebar \"*devices*, setup\"", + "intent_template": "Create a new forum with name and 
title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", "instantiation_dict": { "name": "Karaoke", "description": "Place for Karaoke lovers", - "sidebar_list": ["devices", "setup"] + "sidebar_list": "*devices*, setup" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16713,12 +17303,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "Karaoke", - "description": "Place for Karaoke lovers", - "sidebar": ["devices", "setup"] + "url": "__REDDIT__/create_forum", + "http_method": "POST", + "response_status": 302, + "post_data": { + "forum[name]": "Karaoke", + "forum[title]": "Karaoke", + "forum[description]": "Place for Karaoke lovers", + "forum[sidebar]": "*devices*, setup" + } } } ], @@ -16729,11 +17324,16 @@ "task_id": 585, "intent_template_id": 194, "start_urls": ["__SHOPPING__"], - "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?", - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", - "instantiation_dict": {"product": "floor lamp", "num_star": 5, "nickname": "Emma Lopez"}, + "intent": "Rate my recently purchased floor lamp with 5 stars using my nickname Emma Lopez, with the summary \"Good purchase\" and review \"I like it\"", + "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", + "instantiation_dict": { + "product": "floor lamp", + "num_star": 5, + "nickname": "Emma Lopez", + "summary": "Good purchase", + "review": "I like it" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16741,9 +17341,20 
@@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B00J8RZL7I", "num_star": 5, "nickname": "Emma Lopez"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/review/product/post/id/73063/", + "http_method": "POST", + "post_data": { + "ratings[4]": "20", + "nickname": "Emma Lopez", + "title": "Good purchase", + "detail": "I like it" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16753,15 +17364,16 @@ "task_id": 586, "intent_template_id": 194, "start_urls": ["__SHOPPING__"], - "intent": "Rate my recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?", - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "intent": "Rate my recently purchased Jiffy Mix with 4 stars using my nickname ShoppingEmma, with the summary \"Good purchase\" and review \"I like it\"", + "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", "instantiation_dict": { - "product": "Jiffy Corn Muffin Cornbread Mix", + "product": "Jiffy Mix", "num_star": 4, - "nickname": "ShoppingEmma" + "nickname": "ShoppingEmma", + "summary": "Good purchase", + "review": "I like it" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16769,9 +17381,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B07HZB38XH", "num_star": 4, "nickname": "ShoppingEmma"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/review/product/post/id/102586/", + 
"http_method": "POST", + "post_data": { + "ratings[4]": "19", + "nickname": "ShoppingEmma", + "title": "Good purchase", + "detail": "I like it" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16781,15 +17404,16 @@ "task_id": 587, "intent_template_id": 194, "start_urls": ["__SHOPPING__"], - "intent": "Rate my recent purchase of PS3 Remote Controller Skins with 3 stars, using my nickname GamingEmma?", - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "intent": "Rate my recently purchased PS3 accessory with 3 stars using my nickname GamingEmma, with the summary \"Ok I guess\" and review \"Does the job\"", + "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", "instantiation_dict": { - "product": "PS3 Remote Controller Skins", + "product": "PS3 accessory", "num_star": 3, - "nickname": "GamingEmma" + "nickname": "GamingEmma", + "summary": "Ok I guess", + "review": "Does the job" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16797,9 +17421,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B0041MSF2S", "num_star": 3, "nickname": "GamingEmma"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/review/product/post/id/101441/", + "http_method": "POST", + "post_data": { + "ratings[4]": "18", + "nickname": "GamingEmma", + "title": "Ok I guess", + "detail": "Does the job" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16809,15 +17444,16 @@ "task_id": 588, "intent_template_id": 194, "start_urls": ["__SHOPPING__"], - "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using 
my nickname ShoppingEmma?", - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "intent": "Rate my recently purchased Foundation For Mattress With Frame Set with 1 stars using my nickname ShoppingEmma, with the summary \"Very bad\" and review \"I hated it\"", + "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", "instantiation_dict": { "product": "Foundation For Mattress With Frame Set", "num_star": 1, - "nickname": "ShoppingEmma" + "nickname": "ShoppingEmma", + "summary": "Very bad", + "review": "I hated it" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16825,9 +17461,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B07DFJ5XKH", "num_star": 1, "nickname": "ShoppingEmma"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/review/product/post/id/14854/", + "http_method": "POST", + "post_data": { + "ratings[4]": "16", + "nickname": "ShoppingEmma", + "title": "Very bad", + "detail": "I hated it" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16837,15 +17484,16 @@ "task_id": 589, "intent_template_id": 194, "start_urls": ["__SHOPPING__"], - "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname SimpleEmma?", - "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?", + "intent": "Rate my recently purchased Mini Wireless Bluetooth Speaker with 2 stars using my nickname SimpleEmma, with the summary \"Very bad\" and review \"I hated it\"", + "intent_template": "Rate my recently purchased {{product}} with 
{{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", "instantiation_dict": { "product": "Mini Wireless Bluetooth Speaker", "num_star": 2, - "nickname": "SimpleEmma" + "nickname": "SimpleEmma", + "summary": "Very bad", + "review": "I hated it" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16853,9 +17501,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", + "evaluator": "NetworkEventEvaluator", "site": "shopping", - "expected": {"sku": "B09P7BFL4H", "num_star": 2, "nickname": "SimpleEmma"} + "last_event_only": true, + "expected": { + "url": "__SHOPPING__/review/product/post/id/76228/", + "http_method": "POST", + "post_data": { + "ratings[4]": "17", + "nickname": "SimpleEmma", + "title": "Very bad", + "detail": "I hated it" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16865,15 +17524,15 @@ "task_id": 590, "intent_template_id": 339, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2023 and ending on 1/30/2023", - "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "intent": "Create a milestone in the current repo with title \"product launch\" for the upcoming event of product launch starting on January 16, 2023 and ending on January 30, 2023", + "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", "instantiation_dict": { + "title": "product launch", "event": "event of product launch", "start_date": "1/16/2023", - "end_date": "1/30/2023" + "end_date": "on January 30, 2023" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current 
page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16881,17 +17540,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "values": { - "title": "product launch", - "start_date": "2023-01-16", - "due_date": "2023-01-30" - }, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/primer/design/-/milestones", + "http_method": "POST", + "post_data": { + "milestone[title]": "product launch", + "milestone[start_date]": "2023-01-16", + "milestone[due_date]": "2023-01-30" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16901,15 +17560,15 @@ "task_id": 591, "intent_template_id": 339, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2023 and ending on in 20 days", - "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "intent": "Create a milestone in the current repo with title \"code review\" for the upcoming practice of collective code review starting on January 16, 2023 and ending in 20 days (inclusive)", + "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", "instantiation_dict": { + "title": "code review", "event": "practice of collective code review", "start_date": "1/16/2023", - "end_date": "in 20 days" + "end_date": "in 20 days (inclusive)" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16917,13 +17576,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": 
"BackendStateEvaluator", - "group": "primer", - "project": "design", - "values": {"title": "code review", "start_date": "2023-01-16", "due_date": "2023-02-05"}, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/primer/design/-/milestones", + "http_method": "POST", + "post_data": { + "milestone[title]": "code review", + "milestone[start_date]": "2023-01-16", + "milestone[due_date]": "2023-02-05" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16933,15 +17596,15 @@ "task_id": 592, "intent_template_id": 339, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2023 and ending on in 20 days", - "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "intent": "Create a milestone in the current repo with title \"sensitive information\" for the upcoming task of cleaning sensitive information starting on February 16, 2023 and ending in 20 days (inclusive)", + "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", "instantiation_dict": { + "title": "sensitive information", "event": "task of cleaning sensitive information", "start_date": "2/16/2023", - "end_date": "in 20 days" + "end_date": "in 20 days (inclusive)" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16949,17 +17612,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "values": { - "title": "sensitive information", - "start_date": "2023-02-16", - "due_date": "2023-03-08" - }, - 
"title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/primer/design/-/milestones", + "http_method": "POST", + "post_data": { + "milestone[title]": "sensitive information", + "milestone[start_date]": "2023-02-16", + "milestone[due_date]": "2023-03-08" + }, + "response_status": 302 + } } ], "revision": 2 @@ -16969,15 +17632,15 @@ "task_id": 593, "intent_template_id": 339, "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", - "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "intent": "Create a milestone in the current repo with title \"all branches to main\" for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", + "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", "instantiation_dict": { + "title": "all branches to main", "event": "task of merging all branches to main", "start_date": "March 15, 2044", - "end_date": "March 30, 2044" + "end_date": "on March 30, 2044" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -16985,17 +17648,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "values": { - "title": "all branches to main", - "start_date": "2044-03-15", - "due_date": "2044-03-30" - }, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": 
"__GITLAB__/byteblaze/dotfiles/-/milestones", + "http_method": "POST", + "post_data": { + "milestone[title]": "all branches to main", + "milestone[start_date]": "2044-03-15", + "milestone[due_date]": "2044-03-30" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17005,15 +17668,15 @@ "task_id": 594, "intent_template_id": 339, "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days", - "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}", + "intent": "Create a milestone in the current repo with title \"zsh comprehensive support\" for the upcoming task of adding a new branch for zsh comprehensive support starting on May 1, 2044 and ending in 20 days (inclusive)", + "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", "instantiation_dict": { + "title": "zsh comprehensive support", "event": "task of adding a new branch for zsh comprehensive support", "start_date": "5/1/2044", - "end_date": "in 20 days" + "end_date": "in 20 days (inclusive)" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17021,17 +17684,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "values": { - "title": "zsh comprehensive support", - "start_date": "2044-05-01", - "due_date": "2044-05-21" - }, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/dotfiles/-/milestones", + "http_method": "POST", + 
"post_data": { + "milestone[title]": "zsh comprehensive support", + "milestone[start_date]": "2044-05-01", + "milestone[due_date]": "2044-05-21" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17041,11 +17704,10 @@ "task_id": 595, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the thread of a trending post on the forum \"space\" and subscribe.", - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": {"subreddit": "space"}, + "intent": "Open the hottest post in the forum \"space\" and subscribe", + "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "instantiation_dict": {"post_selector": "hottest", "forum": "space"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17053,9 +17715,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "space"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/f/space/subscribe.json", "__REDDIT__/f/space/unsubscribe.json"], + "http_method": "POST", + "headers": { + "referer": "__REDDIT__/f/space/69581/the-moon-saturn-and-jupiter-through-my-4-telescope-and" + }, + "response_content": {"subscribed": true} + } } ], "revision": 2 @@ -17065,11 +17733,10 @@ "task_id": 596, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.", - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": {"subreddit": "books"}, + "intent": "Open the all time top post in the forum \"books\" and subscribe", + "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "instantiation_dict": 
{"post_selector": "all time top", "forum": "books"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17077,9 +17744,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "books"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/f/books/subscribe.json", "__REDDIT__/f/books/unsubscribe.json"], + "http_method": "POST", + "headers": { + "referer": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed" + }, + "response_content": {"subscribed": true} + } } ], "revision": 2 @@ -17089,11 +17762,10 @@ "task_id": 597, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.", - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": {"subreddit": "consoles"}, + "intent": "Open the most controversial post in the forum \"consoles\" and subscribe", + "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "instantiation_dict": {"post_selector": "most controversial", "forum": "consoles"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17101,9 +17773,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "consoles"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": [ + "__REDDIT__/f/consoles/subscribe.json", "__REDDIT__/f/consoles/unsubscribe.json" + ], + "http_method": "POST", + "headers": { + "referer": "__REDDIT__/f/consoles/17949/i-like-xbox-series-s-more-than-xbox-series-x" + }, + "response_content": 
{"subscribed": true} + } } ], "revision": 2 @@ -17113,11 +17793,10 @@ "task_id": 598, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.", - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": {"subreddit": "pittsburgh"}, + "intent": "Open the all time most commented post in the forum \"pittsburgh\" and subscribe", + "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "instantiation_dict": {"post_selector": "all time most commented", "forum": "pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17125,9 +17804,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "pittsburgh"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": [ + "__REDDIT__/f/pittsburgh/subscribe.json", + "__REDDIT__/f/pittsburgh/unsubscribe.json" + ], + "http_method": "POST", + "headers": { + "referer": "__REDDIT__/f/pittsburgh/132053/is-bbq-chipped-chopped-ham-a-thing-my-grandma-says-it-is" + }, + "response_content": {"subscribed": true} + } } ], "revision": 2 @@ -17137,11 +17825,10 @@ "task_id": 599, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.", - "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.", - "instantiation_dict": {"subreddit": "machine learning"}, + "intent": "Open the newest post in the forum \"machine learning\" and subscribe", + "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "instantiation_dict": {"post_selector": "newest", "forum": "machine 
learning"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17149,9 +17836,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "machine learning"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": [ + "__REDDIT__/f/machine learning/subscribe.json", + "__REDDIT__/f/machine learning/unsubscribe.json" + ], + "http_method": "POST", + "headers": { + "referer": "__REDDIT__/f/machine learning/120613/d-yan-lecun-s-recent-recommendations" + }, + "response_content": {"subscribed": true} + } } ], "revision": 2 @@ -17161,11 +17857,10 @@ "task_id": 600, "intent_template_id": 3765, "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", - "instantiation_dict": {"question": "what is the recommended console to buy these days"}, + "intent": "Post my question with the title \"what is the recommended console to buy these days?\", in a forum where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", + "instantiation_dict": {"question": "what is the recommended console to buy these days?"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17173,11 +17868,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "consoles", - "title": "what is the recommended console to buy these days" + "url": 
["__REDDIT__/submit", "__REDDIT__/submit/consoles"], + "http_method": "POST", + "post_data": { + "submission[title]": "what is the recommended console to buy these days?", + "submission[forum]": "10039" + }, + "response_status": 302 } } ], @@ -17188,11 +17887,10 @@ "task_id": 601, "intent_template_id": 3765, "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "intent": "Post my question with the title \"is car necessary in NYC\", in a forum where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "is car necessary in NYC"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17200,9 +17898,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "nyc", "title": "is car necessary in NYC?"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], + "http_method": "POST", + "post_data": {"submission[title]": "is car necessary in NYC", "submission[forum]": "10067"}, + "response_status": 302 + } } ], "revision": 2 @@ -17212,11 +17914,10 @@ "task_id": 602, "intent_template_id": 3765, "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "intent": "Post my question with the title \"places for new drivers to learn driving 
in pittsburgh\", in a forum where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "places for new drivers to learn driving in pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17224,11 +17925,15 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "pittsburgh", - "title": "places for new drivers to learn driving in pittsburgh" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/pittsburgh"], + "http_method": "POST", + "post_data": { + "submission[title]": "places for new drivers to learn driving in pittsburgh", + "submission[forum]": "10073" + }, + "response_status": 302 } } ], @@ -17239,11 +17944,10 @@ "task_id": 603, "intent_template_id": 3765, "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "intent": "Post my question with the title \"safe and budget apartment to live in nyc\", in a forum where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "safe and budget apartment to live in nyc"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17251,9 +17955,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "nyc", 
"title": "safe and budget apartment to live in nyc"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], + "http_method": "POST", + "post_data": { + "submission[title]": "safe and budget apartment to live in nyc", + "submission[forum]": "10067" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17263,11 +17974,10 @@ "task_id": 604, "intent_template_id": 3765, "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a subreddit where I'm likely to get an answer", + "intent": "Post my question with the title \"what is the SOTA web navigation agent repo\", in a forum where I'm likely to get an answer", + "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "what is the SOTA web navigation agent repo"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17275,11 +17985,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": ["deeplearning", "MachineLearning", "singularity"], - "title": "what is the SOTA web navigation agent repo" + "url": [ + "__REDDIT__/submit", "__REDDIT__/submit/deeplearning", + "__REDDIT__/submit/MachineLearning", "__REDDIT__/submit/singularity" + ], + "http_method": "POST", + "post_data": { + "submission[title]": "what is the SOTA web navigation agent repo", + "submission[forum]": "{{forum_id}}" + }, + "response_status": 302 } } ], @@ -17290,14 +18007,13 @@ "task_id": 605, "intent_template_id": 5, "start_urls": ["__REDDIT__"], - "intent": "Find a subreddit 
focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "intent": "Find a forum focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there", + "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "gaming consoles", "question": "what is the recommended console to buy these days" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17320,11 +18036,10 @@ "task_id": 606, "intent_template_id": 5, "start_urls": ["__REDDIT__"], - "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there", - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "intent": "Find a forum focused on topics related to NYC, and post my question, \"is car necessary\" there", + "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": {"topic": "NYC", "question": "is car necessary"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17344,14 +18059,13 @@ "task_id": 607, "intent_template_id": 5, "start_urls": ["__REDDIT__"], - "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "intent": "Find a forum focused on topics related to city Pittsburgh, and post my question, \"places for new 
drivers to learn driving\" there", + "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "city Pittsburgh", "question": "places for new drivers to learn driving" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17371,14 +18085,13 @@ "task_id": 608, "intent_template_id": 5, "start_urls": ["__REDDIT__"], - "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "intent": "Find a forum focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", + "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "city lives in DMV area", "question": "safe and budget apartment to live" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17398,14 +18111,13 @@ "task_id": 609, "intent_template_id": 5, "start_urls": ["__REDDIT__"], - "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", - "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", + "intent": "Find a forum focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", + "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": { "topic": "ML, DL, NLP", "question": "what is the SOTA web navigation agent repo" }, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17428,11 +18140,10 @@ "task_id": 610, "intent_template_id": 9, "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in the r/books and put my comment \"good book!\" in the body.", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in f/books and then comment \"good book!\" on the created post", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "To Kill a Mockingbird by Harper Lee", "content": "good book!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17440,12 +18151,25 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": { + "submission[title]": "To Kill a Mockingbird by Harper Lee", + "submission[forum]": "10037" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "books", - "content": "good book!", - "title": "To Kill a Mockingbird by Harper Lee" + "url": "^__REDDIT__/f/books/\\d+/-/comment$", + "http_method": "POST", + "headers": {"referer": "^__REDDIT__/f/books/\\d+/to-kill-a-mockingbird-by-harper-lee$"}, + "post_data": {"reply_to_submission_{{post_id}}[comment]": "good book!"}, + "response_status": 302 } } ], @@ -17456,11 +18180,10 @@ "task_id": 611, "intent_template_id": 9, 
"start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Harry Potter\" in the r/books and put my comment \"Wonderful journey\" in the body.", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "intent": "Post a review of my recent reading, titled \"Harry Potter\" in f/books and then comment \"Wonderful journey\" on the created post", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Harry Potter", "content": "Wonderful journey"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17468,9 +18191,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "books", "content": "Wonderful journey", "title": "Harry Potter"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": {"submission[title]": "Harry Potter", "submission[forum]": "10037"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/books/{{post_id}}/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_{{post_id}}[comment]": "Wonderful journey"}, + "response_status": 302 + } } ], "revision": 2 @@ -17480,11 +18216,10 @@ "task_id": 612, "intent_template_id": 9, "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"big little lies\" in the r/books and put my comment \"can't stop it\" in the body.", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "intent": "Post a 
review of my recent reading, titled \"big little lies\" in f/books and then comment \"can't stop it\" on the created post", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "big little lies", "content": "can't stop it"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17492,9 +18227,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "books", "content": "can't stop it", "title": "big little lies"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": {"submission[title]": "big little lies", "submission[forum]": "10037"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/books/{{post_id}}/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_{{post_id}}[comment]": "can't stop it"}, + "response_status": 302 + } } ], "revision": 2 @@ -17504,11 +18252,10 @@ "task_id": 613, "intent_template_id": 9, "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Love story\" in the r/books and put my comment \"I cried\" in the body.", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "intent": "Post a review of my recent reading, titled \"Love story\" in f/books and then comment \"I cried\" on the created post", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Love story", "content": "I cried"}, "format_specification": 
null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17516,9 +18263,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "books", "comment": "I cried", "title": "Love story"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": {"submission[title]": "Love story", "submission[forum]": "10037"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/books/{{post_id}}/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_{{post_id}}[comment]": "I cried"}, + "response_status": 302 + } } ], "revision": 2 @@ -17528,11 +18288,10 @@ "task_id": 614, "intent_template_id": 9, "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in the r/books and put my comment \"It's a book with history\" in the body.", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in the r/books and put my comment \"{{content}}\" in the body.", + "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in f/books and then comment \"It's a book with history\" on the created post", + "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Gone with the wind", "content": "It's a book with history"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17540,12 +18299,21 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": 
{ - "subreddit": "books", - "content": "It's a book with history", - "title": "Gone with the wind" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": {"submission[title]": "Gone with the wind", "submission[forum]": "10037"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/books/{{post_id}}/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_{{post_id}}[comment]": "It's a book with history"}, + "response_status": 302 } } ], @@ -17556,11 +18324,10 @@ "task_id": 615, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", - "instantiation_dict": {"content": "Bald Eagle", "subreddit": "earthporn"}, + "intent": "Re-post the image of Bald Eagle from this forum to earthporn forum using the image URL and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", + "instantiation_dict": {"content": "Bald Eagle", "forum": "earthporn"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17568,12 +18335,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "earthporn", - "file_name": "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg", - "title": "from /f/pics" + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[url]": 
"__REDDIT__/submission_images/b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg", + "submission[title]": "from /f/pics", + "submission[forum]": "10009" + }, + "response_status": 302 } } ], @@ -17584,11 +18355,10 @@ "task_id": 616, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", - "instantiation_dict": {"content": "Thanksgiving turkey", "subreddit": "funny"}, + "intent": "Re-post the image of Thanksgiving turkey from this forum to funny forum using the image URL and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", + "instantiation_dict": {"content": "Thanksgiving turkey", "forum": "funny"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17596,12 +18366,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "funny", - "file_name": "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg", - "title": "from /f/pics" + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[url]": "__REDDIT__/submission_images/2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg", + "submission[title]": "from /f/pics", + "submission[forum]": "10046" + }, + "response_status": 302 } } ], @@ -17612,11 +18386,10 @@ "task_id": 617, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Firework in this page to 
earthporn subreddit and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", - "instantiation_dict": {"content": "Firework", "subreddit": "earthporn"}, + "intent": "Re-post the image of Firework from this forum to earthporn forum using the image URL and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", + "instantiation_dict": {"content": "Firework", "forum": "earthporn"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17624,12 +18397,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "earthporn", - "file_name": "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg", - "title": "from /f/pics" + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[url]": "__REDDIT__/submission_images/92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg", + "submission[title]": "from /f/pics", + "submission[forum]": "10009" + }, + "response_status": 302 } } ], @@ -17640,11 +18417,10 @@ "task_id": 618, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Wife's costume in this page to funny subreddit and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", - "instantiation_dict": {"content": "Wife's costume", "subreddit": "funny"}, + "intent": "Re-post the image of Wife's costume from this forum to funny forum using the image URL and title \"from /f/pics\"", + "intent_template": 
"Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", + "instantiation_dict": {"content": "Wife's costume", "forum": "funny"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17652,12 +18428,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "funny", - "file_name": "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg", - "title": "from /f/pics" + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[url]": "__REDDIT__/submission_images/6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg", + "submission[title]": "from /f/pics", + "submission[forum]": "10046" + }, + "response_status": 302 } } ], @@ -17668,11 +18448,10 @@ "task_id": 619, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of costume contest in this page to funny subreddit and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and title \"from /f/pics\"", - "instantiation_dict": {"content": "costume contest", "subreddit": "funny"}, + "intent": "Re-post the image of costume contest from this forum to funny forum using the image URL and title \"from /f/pics\"", + "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", + "instantiation_dict": {"content": "costume contest", "forum": "funny"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17680,12 +18459,16 @@ "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "funny", - "file_name": "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg", - "title": "from /f/pics" + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[url]": "__REDDIT__/submission_images/bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg", + "submission[title]": "from /f/pics", + "submission[forum]": "10046" + }, + "response_status": 302 } } ], @@ -17696,11 +18479,10 @@ "task_id": 620, "intent_template_id": 12, "start_urls": ["__REDDIT__"], - "intent": "Ask for advice with the title \"deal with long-distance relationships\" in a subreddit for relations", - "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "intent": "Ask for advice in a forum for relations. Title it \"deal with long-distance relationships\" and in post details ask \"Please help\"", + "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "deal with long-distance relationships"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17708,11 +18490,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "relationship_advice", - "title": ["long", "distance", "relationship"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "deal with long-distance relationships", + "submission[body]": "Please help", + "submission[forum]": "10075" + }, + "response_status": 302 } } ], @@ -17723,11 +18510,10 @@ "task_id": 621, "intent_template_id": 12, "start_urls": ["__REDDIT__"], - "intent": "Ask for advice with the title \"cheat\" in a subreddit for relations", - "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "intent": "Ask for advice in a forum for relations. Title it \"cheat\" and in post details ask \"Please help\"", + "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "cheat"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17735,9 +18521,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "relationship_advice", "title": "cheat"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "cheat", + "submission[body]": "Please help", + "submission[forum]": "10075" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17747,11 +18541,10 @@ "task_id": 622, "intent_template_id": 12, "start_urls": ["__REDDIT__"], - "intent": "Ask for advice with the title \"sexual harassment\" in a subreddit for relations", - "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "intent": "Ask for advice in a forum for relations. Title it \"sexual harassment\" and in post details ask \"Please help\"", + "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "sexual harassment"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17759,9 +18552,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "relationship_advice", "title": "sexual harassment"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "sexual harassment", + "submission[body]": "Please help", + "submission[forum]": "10075" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17771,11 +18572,10 @@ "task_id": 623, "intent_template_id": 12, "start_urls": ["__REDDIT__"], - "intent": "Ask for advice with the title \"gift for birthday\" in a subreddit for relations", - "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "intent": "Ask for advice in a forum for relations. Title it \"gift for birthday\" and in post details ask \"Please help\"", + "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "gift for birthday"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17783,9 +18583,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "relationship_advice", "title": ["gift", "birthday"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "gift for birthday", + "submission[body]": "Please help", + "submission[forum]": "10075" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17795,11 +18603,10 @@ "task_id": 624, "intent_template_id": 12, "start_urls": ["__REDDIT__"], - "intent": "Ask for advice with the title \"break-up remedy\" in a subreddit for relations", - "intent_template": "Ask for advice with the title \"{{issue}}\" in a subreddit for relations", + "intent": "Ask for advice in a forum for relations. Title it \"break-up remedy\" and in post details ask \"Please help\"", + "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "break-up remedy"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17807,9 +18614,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "relationship_advice", "title": ["break", "up", "remedy"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "break-up remedy", + "submission[body]": "Please help", + "submission[forum]": "10075" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17819,11 +18634,10 @@ "task_id": 625, "intent_template_id": 13, "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "the effectiveness of deep learning"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17831,12 +18645,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + 
"evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "deeplearning", - "content": ["your opinion"], - "title": "the effectiveness of deep learning" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/deeplearning"], + "http_method": "POST", + "post_data": { + "submission[title]": "the effectiveness of deep learning", + "submission[body]": "your opinion", + "submission[forum]": "10043" + }, + "response_status": 302 } } ], @@ -17847,11 +18665,10 @@ "task_id": 626, "intent_template_id": 13, "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent": "Create a discussion post titled \"Iphone 14\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Iphone 14"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17859,9 +18676,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "iphone", "content": ["your opinion"], "title": "Iphone 14" } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], + "http_method": "POST", + "post_data": { + "submission[title]": "Iphone 14", + "submission[body]": "your opinion", + "submission[forum]": "10052" + }, + "response_status": 302 + } } ], "revision": 2 @@ -17871,11 +18696,10 @@ "task_id": 627, "intent_template_id": 13, 
"start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Harry Potter movie series"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17883,12 +18707,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "movies", - "content": ["your opinion"], - "title": "Harry Potter movie series" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/movies"], + "http_method": "POST", + "post_data": { + "submission[title]": "Harry Potter movie series", + "submission[body]": "your opinion", + "submission[forum]": "10060" + }, + "response_status": 302 } } ], @@ -17899,11 +18727,10 @@ "task_id": 628, "intent_template_id": 13, "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent": "Create a discussion post titled \"long distance relationship\" in a 
relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "long distance relationship"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17911,12 +18738,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "relationship_advice", - "content": ["your opinion"], - "title": "long distance relationship" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], + "http_method": "POST", + "post_data": { + "submission[title]": "long distance relationship", + "submission[body]": "your opinion", + "submission[forum]": "10075" + }, + "response_status": 302 } } ], @@ -17927,11 +18758,10 @@ "task_id": 629, "intent_template_id": 13, "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", + "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Fun thing to do in Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", @@ -17939,12 +18769,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "pittsburgh", - "content": ["your opinion"], - "title": "Fun thing to do in Pittsburgh" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/pittsburgh"], + "http_method": "POST", + "post_data": { + "submission[title]": "Fun thing to do in Pittsburgh", + "submission[body]": "your opinion", + "submission[forum]": "10073" + }, + "response_status": 302 } } ], @@ -17955,15 +18789,14 @@ "task_id": 630, "intent_template_id": 15, "start_urls": ["__REDDIT__"], - "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones. Put all relevant information in the body of the post.", - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", + "intent": "Create a post in f/headphones. Title it \"noise-cancelling headphones recommendations?\" and in post details ask \"Hey, looking for good noise-cancelling headphones under $200, any suggestions?\"", + "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", "instantiation_dict": { "category": "noise-cancelling headphones", "price": "$200", - "subreddit": "r/headphones" + "forum": "f/headphones" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17971,11 +18804,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "headphones", - "content": ["200", "noise", "cancelling", "headphones"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], + "http_method": "POST", + "post_data": { + "submission[title]": "noise-cancelling headphones recommendations?", + "submission[body]": "Hey, looking for good noise-cancelling headphones under $200, any suggestions?", + "submission[forum]": "10050" + }, + "response_status": 302 } } ], @@ -17986,11 +18824,10 @@ "task_id": 631, "intent_template_id": 15, "start_urls": ["__REDDIT__"], - "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports. Put all relevant information in the body of the post.", - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", - "instantiation_dict": {"category": "running shoes", "price": "$100", "subreddit": "r/sports"}, + "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $100, any suggestions?\"", + "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", + "instantiation_dict": {"category": "running shoes", "price": "$100", "forum": "f/sports"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -17998,9 +18835,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "sports", "content": ["running", "shoes", "100"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], + "http_method": "POST", + "post_data": { + "submission[title]": "running shoes recommendations?", + "submission[body]": "Hey, looking for good running shoes under $100, any suggestions?", + "submission[forum]": "10080" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18010,11 +18855,10 @@ "task_id": 632, "intent_template_id": 15, "start_urls": ["__REDDIT__"], - "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports. Put all relevant information in the body of the post.", - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", - "instantiation_dict": {"category": "running shoes", "price": "$500", "subreddit": "r/sports"}, + "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $500, any suggestions?\"", + "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", + "instantiation_dict": {"category": "running shoes", "price": "$500", "forum": "f/sports"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18022,9 +18866,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "sports", "content": ["running", "shoes", "500"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], + "http_method": "POST", + "post_data": { + "submission[title]": "running shoes recommendations?", + "submission[body]": "Hey, looking for good running shoes under $500, any suggestions?", + "submission[forum]": "10080" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18034,11 +18886,10 @@ "task_id": 633, "intent_template_id": 15, "start_urls": ["__REDDIT__"], - "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports. Put all relevant information in the body of the post.", - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", - "instantiation_dict": {"category": "running pants", "price": "$500", "subreddit": "r/sports"}, + "intent": "Create a post in f/sports. Title it \"running pants recommendations?\" and in post details ask \"Hey, looking for good running pants under $500, any suggestions?\"", + "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", + "instantiation_dict": {"category": "running pants", "price": "$500", "forum": "f/sports"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18046,9 +18897,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "sports", "content": ["running", "pants", "500"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], + "http_method": "POST", + "post_data": { + "submission[title]": "running pants recommendations?", + "submission[body]": "Hey, looking for good running pants under $500, any suggestions?", + "submission[forum]": "10080" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18058,11 +18917,10 @@ "task_id": 634, "intent_template_id": 15, "start_urls": ["__REDDIT__"], - "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone. Put all relevant information in the body of the post.", - "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}. Put all relevant information in the body of the post.", - "instantiation_dict": {"category": "used iphone", "price": "$1000", "subreddit": "r/iphone"}, + "intent": "Create a post in f/iphone. Title it \"used iphone recommendations?\" and in post details ask \"Hey, looking for good used iphone under $1000, any suggestions?\"", + "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", + "instantiation_dict": {"category": "used iphone", "price": "$1000", "forum": "f/iphone"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18070,9 +18928,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "iphone", "content": ["1000", "used", "iphone"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], + "http_method": "POST", + "post_data": { + "submission[title]": "used iphone recommendations?", + "submission[body]": "Hey, looking for good used iphone under $1000, any suggestions?", + "submission[forum]": "10052" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18082,11 +18948,10 @@ "task_id": 635, "intent_template_id": 6100, "start_urls": ["__REDDIT__"], - "intent": "Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200. Put all relevant information in the body of the post.", - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "intent": "Create a post in the most appropriate forum. Title it \"noise-cancelling headphones recommendations\" and in post details ask \"I need recommendations for noise-cancelling headphones within a budget of $200 please\"", + "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "noise-cancelling headphones", "price": "$200"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18094,11 +18959,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "headphones", - "content": ["200", "noise", "cancelling", "headphones"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], + "http_method": "POST", + "post_data": { + "submission[title]": "noise-cancelling headphones recommendations", + "submission[body]": "I need recommendations for noise-cancelling headphones within a budget of $200 please", + "submission[forum]": "10050" + }, + "response_status": 302 } } ], @@ -18109,11 +18979,10 @@ "task_id": 636, "intent_template_id": 6100, "start_urls": ["__REDDIT__"], - "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY toolkit products within a budget of $100. Put all relevant information in the body of the post.", - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "intent": "Create a post in the most appropriate forum. Title it \"DIY toolkit recommendations\" and in post details ask \"I need recommendations for DIY toolkit within a budget of $100 please\"", + "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "DIY toolkit", "price": "$100"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18121,9 +18990,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "diy", "content": ["100", "DIY", "toolkit"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/diy"], + "http_method": "POST", + "post_data": { + "submission[title]": "DIY toolkit recommendations", + "submission[body]": "I need recommendations for DIY toolkit within a budget of $100 please", + "submission[forum]": "10007" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18133,11 +19010,10 @@ "task_id": 637, "intent_template_id": 6100, "start_urls": ["__REDDIT__"], - "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500. Put all relevant information in the body of the post.", - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "intent": "Create a post in the most appropriate forum. Title it \"sony headphones recommendations\" and in post details ask \"I need recommendations for sony headphones within a budget of $500 please\"", + "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "sony headphones", "price": "$500"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18145,9 +19021,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "headphones", "content": ["500", "sony", "headphones"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], + "http_method": "POST", + "post_data": { + "submission[title]": "sony headphones recommendations", + "submission[body]": "I need recommendations for sony headphones within a budget of $500 please", + "submission[forum]": "10050" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18157,11 +19041,10 @@ "task_id": 638, "intent_template_id": 6100, "start_urls": ["__REDDIT__"], - "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product to buy for life products within a budget of $30. Put all relevant information in the body of the post.", - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", - "instantiation_dict": {"category": "must-have product to buy for life", "price": "$30"}, + "intent": "Create a post in the most appropriate forum. Title it \"Must have product at last for ever recommendations\" and in post details ask \"I need recommendations for Must have product at last for ever within a budget of $30 please\"", + "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", + "instantiation_dict": {"category": "Must have product at last for ever", "price": "$30"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18169,11 +19052,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "BuyItForLife", - "content": ["30", "must-have", "product", "life"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/BuyItForLife"], + "http_method": "POST", + "post_data": { + "submission[title]": "Must have product at last for ever recommendations", + "submission[body]": "I need recommendations for Must have product at last for ever within a budget of $30 please", + "submission[forum]": "10003" + }, + "response_status": 302 } } ], @@ -18184,11 +19072,10 @@ "task_id": 639, "intent_template_id": 6100, "start_urls": ["__REDDIT__"], - "intent": "Post in the most appropriate subreddit and ask for recommendations for used iphone products within a budget of $1000. Put all relevant information in the body of the post.", - "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}. Put all relevant information in the body of the post.", + "intent": "Create a post in the most appropriate forum. Title it \"used iphone recommendations\" and in post details ask \"I need recommendations for used iphone within a budget of $1000 please\"", + "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "used iphone", "price": "$1000"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18196,9 +19083,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "iphone", "content": ["1000", "used", "iphone"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], + "http_method": "POST", + "post_data": { + "submission[title]": "used iphone recommendations", + "submission[body]": "I need recommendations for used iphone within a budget of $1000 please", + "submission[forum]": "10052" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18208,11 +19103,10 @@ "task_id": 640, "intent_template_id": 16, "start_urls": ["__REDDIT__"], - "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit. Put all relevant information in the body of the post.", - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", - "instantiation_dict": {"interest": "book reading", "date": "March 15th", "subreddit": "r/books"}, + "intent": "Post a notice in f/books titled \"book reading Meet up!\". Set post details to \"virtual meetup for book reading on March 15th\"", + "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", + "instantiation_dict": {"interest": "book reading", "date": "March 15th", "forum": "f/books"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18220,11 +19114,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "books", - "content": ["book reading", "March", "15", "virtual meetup"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": { + "submission[title]": "book reading Meet up!", + "submission[body]": "virtual meetup for book reading on March 15th", + "submission[forum]": "10037" + }, + "response_status": 302 } } ], @@ -18235,11 +19134,10 @@ "task_id": 641, "intent_template_id": 16, "start_urls": ["__REDDIT__"], - "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit. Put all relevant information in the body of the post.", - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", - "instantiation_dict": {"interest": "Harry Potter", "date": "July 8th", "subreddit": "books"}, + "intent": "Post a notice in books forum titled \"Harry Potter Meet up!\". Set post details to \"virtual meetup for Harry Potter on July 8th\"", + "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", + "instantiation_dict": {"interest": "Harry Potter", "date": "July 8th", "forum": "books forum"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18247,11 +19145,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "books", - "content": ["Harry Potter", "July", "8", "virtual meetup"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": { + "submission[title]": "Harry Potter Meet up!", + "submission[body]": "virtual meetup for Harry Potter on July 8th", + "submission[forum]": "10037" + }, + "response_status": 302 } } ], @@ -18262,11 +19165,10 @@ "task_id": 642, "intent_template_id": 16, "start_urls": ["__REDDIT__"], - "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit. Put all relevant information in the body of the post.", - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", - "instantiation_dict": {"interest": "Big little lies", "date": "Sep 10th", "subreddit": "books"}, + "intent": "Post a notice in books forum titled \"Big little lies Meet up!\". Set post details to \"virtual meetup for Big little lies on Sep 10th\"", + "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", + "instantiation_dict": {"interest": "Big little lies", "date": "Sep 10th", "forum": "books forum"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18274,11 +19176,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "books", - "content": ["Big little lies", "Sep", "10", "virtual meetup"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": { + "submission[title]": "Big little lies Meet up!", + "submission[body]": "virtual meetup for Big little lies on Sep 10th", + "submission[forum]": "10037" + }, + "response_status": 302 } } ], @@ -18289,11 +19196,10 @@ "task_id": 643, "intent_template_id": 16, "start_urls": ["__REDDIT__"], - "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc subreddit. Put all relevant information in the body of the post.", - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", - "instantiation_dict": {"interest": "racing cars", "date": "Oct 21st", "subreddit": "nyc"}, + "intent": "Post a notice in nyc forum titled \"racing cars Meet up!\". Set post details to \"virtual meetup for racing cars on Oct 21st\"", + "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", + "instantiation_dict": {"interest": "racing cars", "date": "Oct 21st", "forum": "nyc forum"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18301,11 +19207,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "nyc", - "content": ["racing cars", "Oct", "21", "virtual meetup"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], + "http_method": "POST", + "post_data": { + "submission[title]": "racing cars Meet up!", + "submission[body]": "virtual meetup for racing cars on Oct 21st", + "submission[forum]": "10067" + }, + "response_status": 302 } } ], @@ -18316,11 +19227,10 @@ "task_id": 644, "intent_template_id": 16, "start_urls": ["__REDDIT__"], - "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the games subreddit. Put all relevant information in the body of the post.", - "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit. Put all relevant information in the body of the post.", - "instantiation_dict": {"interest": "Tears of Kingdom", "date": "Dec 15th", "subreddit": "games"}, + "intent": "Post a notice in games forum titled \"Tears of Kingdom Meet up!\". Set post details to \"virtual meetup for Tears of Kingdom on Dec 15th\"", + "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", + "instantiation_dict": {"interest": "Tears of Kingdom", "date": "Dec 15th", "forum": "games forum"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18328,11 +19238,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "gaming", - "content": ["Tears of Kingdom", "Dec", "15", "virtual meetup"] + "url": ["__REDDIT__/submit", "__REDDIT__/submit/gaming"], + "http_method": "POST", + "post_data": { + "submission[title]": "Tears of Kingdom Meet up!", + "submission[body]": "virtual meetup for Tears of Kingdom on Dec 15th", + "submission[forum]": "10048" + }, + "response_status": 302 } } ], @@ -18343,11 +19258,14 @@ "task_id": 645, "intent_template_id": 19, "start_urls": ["__REDDIT__"], - "intent": "Post in books subreddit about what could machine learning help the corresponding field.", - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "instantiation_dict": {"model_type": "machine learning", "subreddit": "books"}, + "intent": "Post in books forum with title \"What could machine learning help the books field?\"", + "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", + "instantiation_dict": { + "model_type": "machine learning", + "title": "What could machine learning help the books field?", + "forum": "books" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18355,9 +19273,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "books", "title": ["machine learning", "help"] } + 
"evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], + "http_method": "POST", + "post_data": { + "submission[title]": "What could machine learning help the books field?", + "submission[forum]": "10037" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18367,11 +19292,14 @@ "task_id": 646, "intent_template_id": 19, "start_urls": ["__REDDIT__"], - "intent": "Post in DIY subreddit about what could midjourney help the corresponding field.", - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "instantiation_dict": {"model_type": "midjourney", "subreddit": "DIY"}, + "intent": "Post in DIY forum with title \"What could midjourney help the DIY field?\"", + "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", + "instantiation_dict": { + "model_type": "midjourney", + "title": "What could midjourney help the DIY field?", + "forum": "DIY" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18379,9 +19307,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "diy", "title": ["midjourney", "help"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/DIY"], + "http_method": "POST", + "post_data": { + "submission[title]": "What could midjourney help the DIY field?", + "submission[forum]": "10007" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18391,11 +19326,14 @@ "task_id": 647, "intent_template_id": 19, "start_urls": ["__REDDIT__"], - "intent": "Post in technology subreddit about what could open-source LLMs help the corresponding field.", - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - 
"instantiation_dict": {"model_type": "open-source LLMs", "subreddit": "technology"}, + "intent": "Post in technology forum with title \"What could open-source LLMs help the technology field?\"", + "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", + "instantiation_dict": { + "model_type": "open-source LLMs", + "title": "What could open-source LLMs help the technology field?", + "forum": "technology" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18403,9 +19341,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "technology", "title": ["open-source LLMs", "help"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/technology"], + "http_method": "POST", + "post_data": { + "submission[title]": "What could open-source LLMs help the technology field?", + "submission[forum]": "10082" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18415,11 +19360,14 @@ "task_id": 648, "intent_template_id": 19, "start_urls": ["__REDDIT__"], - "intent": "Post in dataisbeautiful subreddit about what could large language models help the corresponding field.", - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "instantiation_dict": {"model_type": "large language models", "subreddit": "dataisbeautiful"}, + "intent": "Post in dataisbeautiful forum with title \"What could large language models help the dataisbeautiful field?\"", + "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", + "instantiation_dict": { + "model_type": "large language models", + "title": "What could large language models help the dataisbeautiful field?", + "forum": "dataisbeautiful" + }, "format_specification": null, - "start_url_context": null, 
"eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18427,9 +19375,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "dataisbeautiful", "title": ["large language models", "help"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/dataisbeautiful"], + "http_method": "POST", + "post_data": { + "submission[title]": "What could large language models help the dataisbeautiful field?", + "submission[forum]": "10042" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18439,11 +19394,14 @@ "task_id": 649, "intent_template_id": 19, "start_urls": ["__REDDIT__"], - "intent": "Post in dataisbeautiful subreddit about what could diffusion model help the corresponding field.", - "intent_template": "Post in {{subreddit}} subreddit about what could {{model_type}} help the corresponding field.", - "instantiation_dict": {"model_type": "diffusion model", "subreddit": "dataisbeautiful"}, + "intent": "Post in dataisbeautiful forum with title \"What could diffusion model help the dataisbeautiful field?\"", + "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", + "instantiation_dict": { + "model_type": "diffusion model", + "title": "What could diffusion model help the dataisbeautiful field?", + "forum": "dataisbeautiful" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18451,9 +19409,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "subreddit": "dataisbeautiful", "title": ["diffusion models", "help"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/dataisbeautiful"], + "http_method": "POST", + "post_data": { + 
"submission[title]": "What could diffusion model help the dataisbeautiful field?", + "submission[forum]": "10042" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18463,14 +19428,13 @@ "task_id": 650, "intent_template_id": 23, "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], - "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"", - "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "intent": "Reply to the post on this page with my comment \"I am a big fan of the bookorg\"", + "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", "content_description": "I am a big fan of the bookorg" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18492,14 +19456,13 @@ "start_urls": [ "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign" ], - "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"", - "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "intent": "Reply to the post on this page with my comment \"Yeah, pittsburgh traffic, you know...\"", + "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", "content_description": "Yeah, pittsburgh traffic, you know..." 
}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18507,9 +19470,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"post_id": "45899", "comment_content": "Yeah, pittsburgh traffic, you know..."} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/pittsburgh/45899/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_45899[comment]": "Yeah, pittsburgh traffic, you know..."}, + "response_status": 302 + } } ], "revision": 2 @@ -18521,11 +19488,10 @@ "start_urls": [ "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research" ], - "intent": "Reply to the post with my comment \"???\"", - "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"", + "intent": "Reply to the post on this page with my comment \"???\"", + "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", "instantiation_dict": {"position_description": "the post", "content_description": "???"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18545,11 +19511,10 @@ "task_id": 653, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", "instantiation_dict": {"product": "phone screen protector", "order_id": "000000180"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18559,13 +19524,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -18575,11 +19535,10 @@ "task_id": 654, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\".
Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", "instantiation_dict": {"product": "bluetooth speaker", "order_id": "161"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18589,13 +19548,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -18605,11 +19559,10 @@ "task_id": 655, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\".
Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", "instantiation_dict": {"product": "iphone case", "order_id": "180"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18619,13 +19572,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -18635,11 +19583,10 @@ "task_id": 656, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU.
Don't submit yet, I will check.", "instantiation_dict": {"product": "remote controller", "order_id": "180"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18649,13 +19596,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -18665,11 +19607,10 @@ "task_id": 657, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.", + "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU.
Don't submit yet, I will check.", "instantiation_dict": {"product": "speaker", "order_id": "148"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18679,13 +19620,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "shopping", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__SHOPPING__/contact", "response_status": 200} } ], "revision": 2 @@ -18695,8 +19631,8 @@ "task_id": 658, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. Set due date to be the end of 2030", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey (you can invite users to the project if needed). Set due date to be the end of 2030", + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed).
Set due date to be {{due}}", "instantiation_dict": { "repo": "a11yproject", "issue": "401 bad gateway", @@ -18704,7 +19640,6 @@ "due": "the end of 2030" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18712,17 +19647,18 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "fields": { - "title": "401 bad gateway", - "assignee.username": "Roshanjossey", - "due_date": "2030-12-31" - }, - "site": "gitlab", - "expected": true - } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "http_method": "POST", + "post_data": { + "issue[title]": "401 bad gateway", + "issue[assignee_ids][]": "2264", + "issue[due_date]": "2030-12-31" + }, + "response_status": 302 + } + } ], "revision": 2 }, @@ -18731,16 +19667,15 @@ "task_id": 659, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. Set due date to be the beginning of Q2 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "intent": "Create an issue in my empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey (you can invite users to the project if needed). Set due date to be the beginning of Q2 2033", + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", "instantiation_dict": { - "repo": "empathy-prompts", + "repo": "my empathy-prompts", "issue": "Integrating LLMs for better prompts", "account": "Roshanjossey", "due": "the beginning of Q2 2033" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18748,16 +19683,26 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "empathy-prompts", - "fields": { - "title": "Integrating LLMs for better prompts", - "assignee.username": "Roshanjossey", - "due_date": "2033-04-01" - }, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/183/invitations", + "http_method": "POST", + "post_data": {"user_id": "2264"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/empathy-prompts/-/issues", + "http_method": "POST", + "post_data": { + "issue[title]": "Integrating LLMs for better prompts", + "issue[assignee_ids][]": "2264", + "issue[due_date]": "2033-04-01" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18767,16 +19712,15 @@ "task_id": 660, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. Set due date to be July 18 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "intent": "Create an issue in my dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek (you can invite users to the project if needed). Set due date to be July 18 2033", + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". 
Assign the issue to {{account}} (you can invite users to the project if needed). Set due date to be {{due}}", "instantiation_dict": { - "repo": "dotfiles", + "repo": "my dotfiles", "issue": "add support for oh-my-zsh", "account": "Abishek", "due": "July 18 2033" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18784,16 +19728,26 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "fields": { - "title": "add support for oh-my-zsh", - "assignee.username": "abisubramanya27", - "due_date": "2033-07-18" - }, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/193/invitations", + "http_method": "POST", + "post_data": {"user_id": "5"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/dotfiles/-/issues", + "http_method": "POST", + "post_data": { + "issue[title]": "add support for oh-my-zsh", + "issue[assignee_ids][]": "5", + "issue[due_date]": "2033-07-18" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18811,7 +19765,6 @@ "repo": "ChatGPT" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18819,12 +19772,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "convexegg", - "project": "chatgpt", - "fields": {"title": "Connection refused"}, + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/175/issues", + "http_method": "POST", + "post_data": {"title": "Connection refused"}, + "response_status": 201 + } } ], "revision": 2 @@ -18842,7 +19797,6 @@ "repo": "aem-hacker" },
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18850,12 +19804,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "0ang3el", - "project": "aem-hacker", - "fields": {"title": "OSError: [Errno 98] Address already in use"}, + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/138/issues", + "http_method": "POST", + "post_data": {"title": "OSError: [Errno 98] Address already in use"}, + "response_status": 201 + } } ], "revision": 2 @@ -18865,7 +19821,7 @@ "task_id": 663, "intent_template_id": 328, "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Llama model support timeline\" to ask the team's plan on supporting Llama and other llama family models in the respository metaseq.", + "intent": "Open an issue with title \"Llama model support timeline\" to ask the team's plan on supporting Llama and other llama family models in the respository metaseq.", "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { "title": "Llama model support timeline", @@ -18873,7 +19829,6 @@ "repo": "metaseq" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18881,12 +19836,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "root", - "project": "metaseq", - "fields": {"title": "Llama model support timeline"}, + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/33/issues", + "http_method": "POST", + "post_data": {"title": "Llama model support timeline"}, + "response_status": 201 + } } ], "revision": 2 @@ -18896,7
+19853,7 @@ "task_id": 664, "intent_template_id": 328, "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Question on future usage of Python 3.11\" to ask the team's plans on adding Python 3.11 related resources in the respository awesome-python.", + "intent": "Open an issue with title \"Question on future usage of Python 3.11\" to ask the team's plans on adding Python 3.11 related resources in the respository awesome-python.", "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { "title": "Question on future usage of Python 3.11", @@ -18904,7 +19861,6 @@ "repo": "awesome-python" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18912,12 +19868,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "vinta", - "project": "awesome-python", - "fields": {"title": "Question on future usage of Python 3.11"}, + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/84/issues", + "http_method": "POST", + "post_data": {"title": "Question on future usage of Python 3.11"}, + "response_status": 201 + } } ], "revision": 2 @@ -18935,7 +19893,6 @@ "repo": "a11y-syntax-highlighting" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18943,12 +19900,14 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "a11y-syntax-highlighting", - "fields": {"title": "Feature Request: MT support"}, + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects/186/issues", + "http_method": "POST", + "post_data":
{"title": "Feature Request: MT support"}, + "response_status": 201 + } } ], "revision": 2 @@ -18958,15 +19917,15 @@ "task_id": 666, "intent_template_id": 335, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a request to merge dialog-component branch into dialog branch, assign Primer as the reviewer", - "intent_template": "Submit a request to merge {{source_branch}} branch into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for dialog-component in the current repository to be merged into dialog branch, assign Primer as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "dialog-component", "target_branch": "dialog", - "reviewer": "Primer" + "reviewer": "Primer", + "source_project": "the current repository" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -18974,14 +19933,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "source_branch": "dialog-component", - "target_branch": "dialog", - "reviewer": "Primer", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/primer/design/-/merge_requests", + "http_method": "POST", + "post_data": { + "merge_request[target_branch]": "dialog", + "merge_request[source_branch]": "dialog-component", + "merge_request[target_project_id]": "180", + "merge_request[source_project_id]": "180", + "merge_request[assignee_ids][]": "0", + "merge_request[reviewer_ids][]": "2367" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18991,15 +19956,15 @@ "task_id": 667, 
"intent_template_id": 335, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat branch, assign primer as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for dialog-component in the current repository to be merged into bump-doctocat branch, assign primer as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "dialog-component", "target_branch": "bump-doctocat", - "reviewer": "primer" + "reviewer": "primer", + "source_project": "the current repository" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19007,14 +19972,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "source_branch": "dialog-component", - "target_branch": "bump-doctocat", - "reviewer": "primer", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/primer/design/-/merge_requests", + "http_method": "POST", + "post_data": { + "merge_request[target_branch]": "bump-doctocat", + "merge_request[source_branch]": "dialog-component", + "merge_request[target_project_id]": "180", + "merge_request[source_project_id]": "180", + "merge_request[assignee_ids][]": "0", + "merge_request[reviewer_ids][]": "2367" + }, + "response_status": 302 + } } ], "revision": 2 @@ -19024,15 +19995,15 @@ "task_id": 668, "intent_template_id": 335, "start_urls": ["__GITLAB__"], - "intent": "Submit a merge 
request for a11yproject.com/redesign branch to be merged into main branch, assign Roshan Jossy as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for redesign in a11yproject.com to be merged into main branch, assign Roshan Jossy as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { - "source_branch": "a11yproject.com/redesign", + "source_branch": "redesign", "target_branch": "main", - "reviewer": "Roshan Jossy" + "reviewer": "Roshan Jossy", + "source_project": "a11yproject.com" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19040,14 +20011,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "source_branch": "redesign", - "target_branch": "main", - "reviewer": "Roshan Jossy", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "http_method": "POST", + "post_data": { + "merge_request[target_branch]": "main", + "merge_request[source_branch]": "redesign", + "merge_request[target_project_id]": "174", + "merge_request[source_project_id]": "174", + "merge_request[assignee_ids][]": "0", + "merge_request[reviewer_ids][]": "2264" + }, + "response_status": 302 + } } ], "revision": 2 @@ -19057,11 +20034,10 @@ "task_id": 669, "intent_template_id": 337, "start_urls": ["__GITLAB__/byteblaze/solarized-prism-theme"], - "intent": "Open a new issue to discuss the implementation of dark mode", - "intent_template": "Open a new issue to discuss the
implementation of {{feature}}", + "intent": "Open a new issue to discuss the implementation of dark mode for the current project", + "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project", "instantiation_dict": {"feature": "dark mode"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19085,11 +20061,10 @@ "task_id": 670, "intent_template_id": 337, "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Open a new issue to discuss the implementation of default plugins for .zsh", - "intent_template": "Open a new issue to discuss the implementation of {{feature}}", + "intent": "Open a new issue to discuss the implementation of default plugins for .zsh for the current project", + "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project", "instantiation_dict": {"feature": "default plugins for .zsh"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19113,11 +20088,10 @@ "task_id": 671, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Sony Computer Entertainment VR\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Sony Computer Entertainment VR\"", + "intent_template": "Gather 
the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", "instantiation_dict": {"product": "Sony Computer Entertainment VR", "rating": "2 stars and less"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19127,13 +20101,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "reddit", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__REDDIT__/f/gaming", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} } ], "revision": 2 @@ -19143,14 +20112,13 @@ "task_id": 672, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", "instantiation_dict": { "product": "Nintendo Switch Fortnite Wildcat Console EU", "rating": "3 stars and less" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19160,13 +20128,8 @@ { "evaluator": 
"NetworkEventEvaluator", "site": "reddit", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__REDDIT__/f/gaming", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} } ], "revision": 2 @@ -19176,11 +20139,10 @@ "task_id": 673, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", "instantiation_dict": {"product": "Racing Wheel Overdrive for Xbox X", "rating": "1 star"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19190,13 +20152,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "reddit", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__REDDIT__/f/gaming", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} } ], "revision": 2 @@ -19206,14 +20163,13 @@ "task_id": 674, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 
stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", "instantiation_dict": { "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", "rating": "3 stars and less" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19223,13 +20179,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "reddit", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__REDDIT__/f/gaming", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} } ], "revision": 2 @@ -19239,11 +20190,10 @@ "task_id": 675, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"", + "intent": "Gather the titles of HORI 3D Surround 
Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", + "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", "instantiation_dict": {"product": "HORI 3D Surround Gaming Neckset", "rating": "2 stars and less"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19253,13 +20203,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "reddit", - "url_match_mode": "exact", "last_event_only": true, - "expected": { - "url": "__REDDIT__/f/gaming", - "response_status": 200, - "event_type": "navigation" - } + "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} } ], "revision": 2 @@ -19269,16 +20214,36 @@ "task_id": 676, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Navigate to the list of orders that are suspected of being fraudulent", - "intent_template": "Navigate to the list of orders that are {{status}}", + "intent": "Show me the list of orders that are suspected of being fraudulent", + "intent_template": "Show me the list of orders that are {{status}}", "instantiation_dict": {"status": "suspected of being fraudulent"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "last_event_only": true, + "expected": { + "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", + "response_status": 200, + "headers": { + "referer": "__SHOPPING_ADMIN__/sales/order/", + "X-Requested-With": "XMLHttpRequest" + }, + "query_params": { + "namespace" : [ "sales_order_grid" ], + 
"filters[placeholder]": [ "true" ], + "filters[status]" : [ "fraud" ], + "search" : [ "" ], + "keywordUpdated" : [ "false" ] + } + }, + "site": "shopping_admin", + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } ], "revision": 2 @@ -19288,11 +20253,10 @@ "task_id": 677, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Navigate to the list of orders that are processing", - "intent_template": "Navigate to the list of orders that are {{status}}", + "intent": "Show me the list of orders that are processing", + "intent_template": "Show me the list of orders that are {{status}}", "instantiation_dict": {"status": "processing"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19301,21 +20265,20 @@ }, { "evaluator": "NetworkEventEvaluator", - "url_match_mode": "prefix", "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "url": "__SHOPPING_ADMIN__/mui/index/render/", "response_status": 200, "headers": { "referer": "__SHOPPING_ADMIN__/sales/order/", "X-Requested-With": "XMLHttpRequest" }, - "query_string": { - "namespace": "sales_order_grid", - "filters[placeholder]": "true", - "filters[status]": "processing", - "search": "", - "keywordUpdated": "false" + "query_params": { + "namespace" : [ "sales_order_grid" ], + "filters[placeholder]": [ "true" ], + "filters[status]" : [ "processing" ], + "search" : [ "" ], + "keywordUpdated" : [ "false" ] } }, "site": "shopping_admin", @@ -19329,11 +20292,10 @@ "task_id": 678, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Navigate to the list of orders that are canceled", - "intent_template": "Navigate to the list of orders that are {{status}}", + "intent": "Show me the list of orders that are canceled", + "intent_template": "Show me the list of orders that are {{status}}", "instantiation_dict": {"status": "canceled"}, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19342,21 +20304,20 @@ }, { "evaluator": "NetworkEventEvaluator", - "url_match_mode": "prefix", "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "url": "__SHOPPING_ADMIN__/mui/index/render/", "response_status": 200, "headers": { "referer": "__SHOPPING_ADMIN__/sales/order/", "X-Requested-With": "XMLHttpRequest" }, - "query_string": { - "namespace": "sales_order_grid", - "filters[placeholder]": "true", - "filters[status]": "canceled", - "search": "", - "keywordUpdated": "false" + "query_params": { + "namespace" : [ "sales_order_grid" ], + "filters[placeholder]": [ "true" ], + "filters[status]" : [ "canceled" ], + "search" : [ "" ], + "keywordUpdated" : [ "false" ] } }, "site": "shopping_admin", @@ -19370,11 +20331,10 @@ "task_id": 679, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Navigate to the list of orders that are completed", - "intent_template": "Navigate to the list of orders that are {{status}}", + "intent": "Show me the list of orders that are completed", + "intent_template": "Show me the list of orders that are {{status}}", "instantiation_dict": {"status": "completed"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19383,21 +20343,16 @@ }, { "evaluator": "NetworkEventEvaluator", - "url_match_mode": "prefix", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", - "response_status": 200, - "headers": { - "referer": "__SHOPPING_ADMIN__/sales/order/", - "X-Requested-With": "XMLHttpRequest" - }, - "query_string": { - "namespace": "sales_order_grid", - "filters[placeholder]": "true", - "filters[status]": "complete", - "search": "", - "keywordUpdated": "false" + "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", + 
"decode_base64_query": true, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, + "query_params": { + "namespace" : [ "sales_order_grid" ], + "filters[placeholder]": [ "true" ], + "filters[status]" : [ "complete" ], + "search" : [ "" ], + "keywordUpdated" : [ "false" ] } }, "site": "shopping_admin", @@ -19411,11 +20366,10 @@ "task_id": 680, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Navigate to the list of orders that are on hold", - "intent_template": "Navigate to the list of orders that are {{status}}", + "intent": "Show me the list of orders that are on hold", + "intent_template": "Show me the list of orders that are {{status}}", "instantiation_dict": {"status": "on hold"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19424,21 +20378,20 @@ }, { "evaluator": "NetworkEventEvaluator", - "url_match_mode": "prefix", "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/?namespace=sales_order_grid", + "url": "__SHOPPING_ADMIN__/mui/index/render/", "response_status": 200, "headers": { "referer": "__SHOPPING_ADMIN__/sales/order/", "X-Requested-With": "XMLHttpRequest" }, - "query_string": { - "namespace": "sales_order_grid", - "filters[placeholder]": "true", - "filters[status]": "holded", - "search": "", - "keywordUpdated": "false" + "query_params": { + "namespace" : [ "sales_order_grid" ], + "filters[placeholder]": [ "true" ], + "filters[status]" : [ "holded" ], + "search" : [ "" ], + "keywordUpdated" : [ "false" ] } }, "site": "shopping_admin", @@ -19452,29 +20405,29 @@ "task_id": 681, "intent_template_id": 116, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to gan implementation and make a Reddit post linking to it in a relevant subreddit", - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "intent": 
"Find a GitLab repository related to gan implementation and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", + "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "gan implementation"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": [ - "__REDDIT__/f/machinelearning", "__REDDIT__/f/deeplearning", - "__REDDIT__/f/technology" - ], - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/eriklindernoren/PyTorch-GAN", + "submission[title]": "eriklindernoren/PyTorch-GAN", + "submission[body]": "193 commits already!", + "submission[forum]": "^(10043|10018)$" + }, + "response_status": 302 } } ], @@ -19485,29 +20438,29 @@ "task_id": 682, "intent_template_id": 116, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit", - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "intent": "Find a GitLab repository related to chatGPT and create a new post in 
the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", + "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "chatGPT"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": [ - "__REDDIT__/f/machinelearning", "__REDDIT__/f/singularity", - "__REDDIT__/f/deeplearning", "__REDDIT__/f/technology" - ], - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/convexegg/chatgpt", + "submission[title]": "convexegg/chatgpt", + "submission[body]": "193 commits already!", + "submission[forum]": "^(10043|10018|10082|10078)$" + }, + "response_status": 302 } } ], @@ -19518,26 +20471,29 @@ "task_id": 683, "intent_template_id": 116, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to metaseq and make a Reddit post linking to it in a relevant subreddit", - "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit", + "intent": "Find a GitLab repository related to metaseq and create a new post in the discussion forum with the URL of the repository to the most relevant 
forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", + "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "metaseq"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": ["__REDDIT__/f/machinelearning", "__REDDIT__/f/deeplearning"], - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/facebook/metaseq", + "submission[title]": "facebook/metaseq", + "submission[body]": "272 commits already!", + "submission[forum]": "^(10043|10018)$" + }, + "response_status": 302 } } ], @@ -19548,26 +20504,28 @@ "task_id": 684, "intent_template_id": 117, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote byteblaze/cloud-to-butt to subreddit LifeProTips with the description from the repo itself.", - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "subreddit": "LifeProTips"}, + "intent": "Promote byteblaze/cloud-to-butt in the discussion forum LifeProTips by creating a post with the project description as the title.", + "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with 
the project description as the title.", + "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "forum": "LifeProTips"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__REDDIT__/f/LifeProTips", - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/byteblaze/cloud-to-butt", + "submission[title]": "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'", + "submission[forum]": "10016" + }, + "response_status": 302 } } ], @@ -19578,23 +20536,29 @@ "task_id": 685, "intent_template_id": 117, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote byteblaze/dotfiles to subreddit aww with the description from the repo itself.", - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": {"repo": "byteblaze/dotfiles", "subreddit": "aww"}, + "intent": "Promote byteblaze/dotfiles in the discussion forum aww by creating a post with the project description as the title.", + "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", + "instantiation_dict": {"repo": "byteblaze/dotfiles", "forum": "aww"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": 
null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/aww", "response_status": 200, "event_type": "navigation"} + "expected": { + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/byteblaze/dotfiles", + "submission[title]": "\ud83e\udd16 Computer setup", + "submission[forum]": "10035" + }, + "response_status": 302 + } } ], "revision": 2 @@ -19604,26 +20568,28 @@ "task_id": 686, "intent_template_id": 117, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.", - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": {"repo": "auth0/angular-storage", "subreddit": "technology"}, + "intent": "Promote auth0/angular-storage in the discussion forum technology by creating a post with the project description as the title.", + "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", + "instantiation_dict": {"repo": "auth0/angular-storage", "forum": "technology"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__REDDIT__/f/technology", - "response_status": 200, - "event_type": "navigation" + "url": 
"^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/auth0/angular-storage", + "submission[title]": "A storage library for AngularJS done right", + "submission[forum]": "10082" + }, + "response_status": 302 } } ], @@ -19634,26 +20600,28 @@ "task_id": 687, "intent_template_id": 117, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote koush/AndroidAsync to subreddit funny with the description from the repo itself.", - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": {"repo": "koush/AndroidAsync", "subreddit": "funny"}, + "intent": "Promote koush/AndroidAsync in the discussion forum funny by creating a post with the project description as the title.", + "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", + "instantiation_dict": {"repo": "koush/AndroidAsync", "forum": "funny"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__REDDIT__/f/funny", - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/koush/AndroidAsync", + "submission[title]": "Asynchronous socket, http(s) (client+server) and websocket library for android. 
Based on nio, not threads.", + "submission[forum]": "10046" + }, + "response_status": 302 } } ], @@ -19664,26 +20632,28 @@ "task_id": 688, "intent_template_id": 117, "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote lahwaacz/arch-wiki-docs to subreddit science with the description from the repo itself.", - "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.", - "instantiation_dict": {"repo": "lahwaacz/arch-wiki-docs", "subreddit": "science"}, + "intent": "Promote lahwaacz/arch-wiki-docs in the discussion forum science by creating a post with the project description as the title.", + "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", + "instantiation_dict": {"repo": "lahwaacz/arch-wiki-docs", "forum": "science"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__REDDIT__/f/science", - "response_status": 200, - "event_type": "navigation" + "url": "^__REDDIT__/submit/?.*$", + "http_method": "POST", + "post_data": { + "submission[mediaType]": "url", + "submission[url]": "__GITLAB__/lahwaacz/arch-wiki-docs", + "submission[title]": "A script to download pages from Arch Wiki for offline browsing", + "submission[forum]": "10077" + }, + "response_status": 302 } } ], @@ -19694,27 +20664,34 @@ "task_id": 689, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a loyal 
customer\"", - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", - "instantiation_dict": {"reason": "I am a loyal customer"}, + "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I am a loyal customer, give me a coupon\"", + "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "instantiation_dict": {"reason": "I am a loyal customer, give me a coupon"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "I am a loyal customer, give me a coupon" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -19724,27 +20701,34 @@ "task_id": 690, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + 
"intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", + "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", "instantiation_dict": {"reason": "they promised me a coupon last time"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "they promised me a coupon last time" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -19754,27 +20738,34 @@ "task_id": 691, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", + "intent_template": "Draft a message using the shop contact form to the shop owner for 
a coupon, enter in the exact reason of \"{{reason}}\"", "instantiation_dict": {"reason": "I plan to make a bulk purchase"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "I plan to make a bulk purchase" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -19784,27 +20775,34 @@ "task_id": 692, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"I am a student\"", - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I am a student\"", + "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", "instantiation_dict": {"reason": "I am a student"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": 
{"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "I am a student" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -19814,27 +20812,34 @@ "task_id": 693, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", - "intent_template": "Draft an email to the shop owner via their contact us function for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", + "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", "instantiation_dict": {"reason": "my refund is supposed to be replaced by a coupon"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - 
"site": "shopping", - "url_match_mode": "exact", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/contact", - "response_status": 200, - "event_type": "navigation" + "url": "^http://.*/dummy_bin$", + "http_method": "POST", + "post_data": { + "form_id": "contact-form", + "name": "Emma Lopez", + "email": "emma.lopez@gmail.com", + "comment": "my refund is supposed to be replaced by a coupon" + }, + "response_status": -1 } + }, + { + "evaluator": "NetworkEventEvaluator", + "should_not_exist": true, + "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} } ], "revision": 2 @@ -19844,17 +20849,16 @@ "task_id": 694, "intent_template_id": 256, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60", - "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "intent": "Add a simple product named \"Energy-Bulk Women Shirt\" with 50 in stock, available in size S and color blue, priced at $60 using the appropriate attribute set.", + "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", "instantiation_dict": { "product": "Energy-Bulk Women Shirt", "stock": "50", - "size": "S", + "size": "size S", "color": "blue", "price": "60" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19862,15 +20866,23 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Energy-Bulk Women Shirt", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { - "name": "Energy-Bulk Women Shirt", - "price": 60.0, - "stock_qty": 50, - "color": 
"50", - "size": "167" + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Energy-Bulk Women Shirt", + "product[price]": "60", + "product[status]": "1", + "product[quantity_and_stock_status][qty]": "50", + "product[quantity_and_stock_status][is_in_stock]": "1", + "product[new]": "1", + "product[size]": "167", + "product[color]": "50" + } } } ], @@ -19881,17 +20893,16 @@ "task_id": 695, "intent_template_id": 256, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99", - "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "intent": "Add a simple product named \"Energy-Bulk Man Yoga Pant\" with 50 in stock, available in size 38 and color yellow, priced at $69.99 using the appropriate attribute set.", + "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", "instantiation_dict": { "product": "Energy-Bulk Man Yoga Pant", "stock": "50", - "size": "38", + "size": "size 38", "color": "yellow", "price": "69.99" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19899,15 +20910,23 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Energy-Bulk Man Yoga Pant", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { - "name": "Energy-Bulk Man Yoga Pant", - "price": 69.99, - "stock_qty": 50, - "color": "60", - "size": "179" + 
"ignored_query_params_patterns": ["isAjax"], + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Energy-Bulk Man Yoga Pant", + "product[price]": "69.99", + "product[status]": "1", + "product[quantity_and_stock_status][qty]": "50", + "product[quantity_and_stock_status][is_in_stock]": "1", + "product[new]": "1", + "product[size]": "179", + "product[color]": "60" + } } } ], @@ -19918,17 +20937,16 @@ "task_id": 696, "intent_template_id": 256, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99", - "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "intent": "Add a simple product named \"FancyBoy Man Causal Jeans\" with 42 in stock, available in size 34 and color Blue, priced at $169.99 using the appropriate attribute set.", + "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", "instantiation_dict": { "product": "FancyBoy Man Causal Jeans", "stock": "42", - "size": "34", + "size": "size 34", "color": "Blue", "price": "169.99" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19936,15 +20954,23 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "FancyBoy Man Causal Jeans", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { - "name": "FancyBoy Man Causal Jeans", - "price": 169.99, - "stock_qty": 42, - "color": "50", - "size": "177" + "ignored_query_params_patterns": 
["isAjax"], + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "FancyBoy Man Causal Jeans", + "product[price]": "169.99", + "product[status]": "1", + "product[quantity_and_stock_status][qty]": "42", + "product[quantity_and_stock_status][is_in_stock]": "1", + "product[new]": "1", + "product[size]": "177", + "product[color]": "50" + } } } ], @@ -19955,17 +20981,16 @@ "task_id": 697, "intent_template_id": 256, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99", - "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "intent": "Add a simple product named \"Swatch Smart Watch\" with 42 in stock, available in a single size and color Blue, priced at $769.99 using the appropriate attribute set.", + "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", "instantiation_dict": { - "product": "Swaatch Smart Watch", + "product": "Swatch Smart Watch", "stock": "42", - "size": "uni-size", + "size": "a single size", "color": "Blue", "price": "769.99" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -19973,15 +20998,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Swaatch Smart Watch", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "expected": { - "name": "Swaatch Smart Watch", - "price": 769.99, - "stock_qty": "42", - "color": "50", - "size": "uni-size" + "ignored_query_params_patterns": 
["isAjax"], + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Swatch Smart Watch", + "product[price]": "769.99", + "product[status]": "1", + "product[quantity_and_stock_status][qty]": "42", + "product[quantity_and_stock_status][is_in_stock]": "1", + "product[new]": "1", + "product[color]": "50" + } } } ], @@ -19992,17 +21024,16 @@ "task_id": 698, "intent_template_id": 256, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99", - "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}", + "intent": "Add a simple product named \"Lelelumon Yoga Mat\" with 42 in stock, available in size uni-size and color black, priced at $769.99 using the appropriate attribute set.", + "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", "instantiation_dict": { "product": "Lelelumon Yoga Mat", "stock": "42", - "size": "uni-size", + "size": "size uni-size", "color": "black", "price": "769.99" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20029,11 +21060,10 @@ "task_id": 699, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers", - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"spring sale\" that offers a 20 percent discount site-wide for all customers", + 
"intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", "instantiation_dict": {"topic": "spring sale", "rule": "a 20 percent discount site-wide"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20041,15 +21071,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "topic": "spring sale", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "name": "spring sale", - "website_ids": [1], - "customer_group_ids": [1], - "discount_amount_type": "by_percent", - "discount_amount": 20 + "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", + "http_method": "POST", + "response_status": 302, + "post_data": { + "name": "spring sale", + "website_ids": [1], + "customer_group_ids": [1], + "simple_action": "by_percent", + "discount_amount": 20 + } } } ], @@ -20060,11 +21094,10 @@ "task_id": 700, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Draft a new marketing price rule for fall discount that offers $10 discount on checkout for all customers", - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"fall discount\" that offers $10 discount on checkout for all customers", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", "instantiation_dict": {"topic": "fall discount", "rule": "$10 discount on checkout"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20072,15 +21105,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "topic": "fall discount", + "evaluator": 
"NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "name": "fall discount", - "website_ids": [1], - "customer_group_ids": [1], - "discount_amount_type": "cart_fixed", - "discount_amount": 10 + "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", + "http_method": "POST", + "response_status": 302, + "post_data": { + "name": "fall discount", + "website_ids": [1], + "customer_group_ids": [1], + "simple_action": "cart_fixed", + "discount_amount": 10 + } } } ], @@ -20091,11 +21128,13 @@ "task_id": 701, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Draft a new marketing price rule for Mother's day sale that offers $15 discount on checkout for all customers", - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - "instantiation_dict": {"topic": "Mother's day sale", "rule": "$15 discount on checkout"}, + "intent": "Create a new marketing price rule called \"Mother's day sale\" that offers 15% discount on checkout on all their cart for all customers", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "instantiation_dict": { + "topic": "Mother's day sale", + "rule": "15% discount on checkout on all their cart" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20103,15 +21142,21 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "topic": "Mother's day sale", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "name": "Mother's day sale", - "website_ids": [1], - "customer_group_ids": [1], - "discount_amount_type": "cart_fixed", - "discount_amount": 15 + "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", + "http_method": "POST", + "response_status": 302, + "post_data": { + "name": "Mother's day sale", + 
"is_active": "1", + "customer_group_ids[0]": "1", + "website_ids[0]": "1", + "coupon_type": "1", + "simple_action": "by_percent", + "discount_amount": 15 + } } } ], @@ -20122,11 +21167,10 @@ "task_id": 702, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers", - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"Pride Month\" that offers 45% off on all products for all customers", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", "instantiation_dict": {"topic": "Pride Month", "rule": "45% off on all products"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20134,15 +21178,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "topic": "Pride Month", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "name": "Pride Month", - "website_ids": [1], - "customer_group_ids": [1], - "discount_amount_type": "by_percent", - "discount_amount": 45 + "url": "__SHOPPING_ADMIN__/catalog_rule/promo_catalog/save/", + "http_method": "POST", + "response_status": 302, + "post_data": { + "name": "Pride Month", + "is_active": "1", + "customer_group_ids[0]": "1", + "website_ids[0]": "1", + "simple_action": "by_percent", + "discount_amount": "45" + } } } ], @@ -20153,11 +21202,10 @@ "task_id": 703, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Draft a new marketing price rule for Thanks giving sale that offers $40 discount on checkout for all customers", - "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers", - 
"instantiation_dict": {"topic": "Thanks giving sale", "rule": "$40 discount on checkout"}, + "intent": "Create a new marketing price rule called \"Thanks giving sale\" that offers $40 discount on all their purchase for all customers", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "instantiation_dict": {"topic": "Thanks giving sale", "rule": "$40 discount on all their purchase"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20165,15 +21213,23 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "topic": "Thanks giving sale", + "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", "expected": { - "name": "Thanks giving sale", - "website_ids": [1], - "customer_group_ids": [1], - "discount_amount_type": "cart_fixed", - "discount_amount": 40 + "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", + "http_method": "POST", + "response_status": 302, + "post_data": { + "name": "Thanks giving sale", + "discount_amount": "40", + "is_active": "1", + "customer_group_ids[0]": "1", + "website_ids[0]": "1", + "coupon_type": "1", + "rule[actions][1][type]": "Magento\\SalesRule\\Model\\Rule\\Condition\\Product\\Combine", + "rule[actions][1][aggregator]": "all", + "rule[actions][1][value]": "1" + } } } ], @@ -20184,21 +21240,19 @@ "task_id": 704, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is 3/15/2023, generate a sales order report for last month", - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", - "instantiation_dict": {"report": "sales order report", "time_span": "for last month"}, + "intent": "Today is March 15, 2023, generate a new sales order report for last months", + "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + 
"instantiation_dict": {"report": "sales order report", "time_span": "for last months"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20212,8 +21266,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "02/1/2023" ], + "to" : [ "02/28/2023" ] + } } } ], @@ -20224,21 +21281,19 @@ "task_id": 705, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is 3/15/2023, generate a sales order report over the last 45 days", - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "intent": "Today is March 15, 2023, generate a new sales order report over the last 45 days", + "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", "instantiation_dict": {"report": "sales order report", "time_span": "over the last 45 days"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, 
"ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20252,8 +21307,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "01/29/2023", "to": "03/15/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "01/29/2023" ], + "to" : [ "03/15/2023" ] + } } } ], @@ -20264,21 +21322,19 @@ "task_id": 706, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is 3/15/2023, generate a refund report for Q1", - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "intent": "Today is March 15, 2023, generate a new refund report for Q1", + "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", "instantiation_dict": {"report": "refund report", "time_span": "for Q1"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20292,8 +21348,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/refunded/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "01/1/2023", "to": "03/31/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "01/1/2023" ], + "to" : [ "03/31/2023" ] + } } } ], @@ -20304,36 +21363,39 @@ "task_id": 707, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is 3/15/2023, generate a sales 
order report for last year", - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "intent": "Today is March 15, 2023, generate a new sales order report for last year", + "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", "instantiation_dict": {"report": "sales order report", "time_span": "for last year"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, "query_string_schema": { "type": "object", "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } + "report_type" : { "type": "string" }, + "from" : { "type": "string", "format": "date" }, + "to" : { "type": "string", "format": "date" }, + "order_statuses[]": { "type": "string" } } }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "01/1/2022", "to": "12/31/2022"} + "query_params": { + "report_type" : [ "created_at_order" ], + "from" : [ "01/1/2022" ], + "to" : [ "12/31/2022" ], + "order_statuses[]": [ "complete" ] + } } } ], @@ -20344,21 +21406,19 @@ "task_id": 708, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is 3/15/2023, generate a tax report for this year", - "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}", + "intent": "Today is March 15, 2023, generate a new tax report for this 
year", + "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", "instantiation_dict": {"report": "tax report", "time_span": "for this year"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20372,8 +21432,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/tax/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "01/1/2023", "to": "12/31/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "01/1/2023" ], + "to" : [ "03/15/2023" ] + } } } ], @@ -20384,25 +21447,19 @@ "task_id": 709, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create an orders report from beginning of May 2021 to end of March 2022", + "intent": "Create an orders report from May 1, 2021 to March 31, 2022", "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": { - "type": "orders", - "start_date": "beginning of May 2021", - "end_date": "end of March 2022" - }, + "instantiation_dict": {"type": "orders", "start_date": "May 1, 2021", "end_date": "March 31, 2022"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { 
"evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20416,8 +21473,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "05/1/2021", "to": "03/31/2022"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "05/1/2021" ], + "to" : [ "03/31/2022" ] + } } } ], @@ -20428,21 +21488,19 @@ "task_id": 710, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a shipping report from 08/05/2022 to 03/01/2023", + "intent": "Create a shipping report from August 5, 2022 to March 1, 2023", "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", "instantiation_dict": {"type": "shipping", "start_date": "08/05/2022", "end_date": "03/01/2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20456,8 +21514,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/shipping/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "08/5/2022", "to": "03/1/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "08/5/2022" ], + "to" : [ "03/1/2023" ] + } } } ], @@ -20468,21 +21529,19 @@ "task_id": 711, "intent_template_id": 271, "start_urls": 
["__SHOPPING_ADMIN__"], - "intent": "Create a product view report from 07/05/2021 to 05/31/2023", + "intent": "Create a product view report from July 5, 2021 to May 31, 2023", "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", "instantiation_dict": {"type": "product view", "start_date": "07/05/2021", "end_date": "05/31/2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20496,8 +21555,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_product/viewed/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "07/5/2021", "to": "05/31/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "07/5/2021" ], + "to" : [ "05/31/2023" ] + } } } ], @@ -20508,21 +21570,19 @@ "task_id": 712, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a coupons report from 05/01/2021 to 05/15/2023", + "intent": "Create a coupons report from May 1, 2021 to May 15, 2023", "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", "instantiation_dict": {"type": "coupons", "start_date": "05/01/2021", "end_date": "05/15/2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": 
"SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20536,8 +21596,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "05/1/2021", "to": "05/15/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "05/1/2021" ], + "to" : [ "05/15/2023" ] + } } } ], @@ -20548,21 +21611,19 @@ "task_id": 713, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a best sellers report from 05/01/2022 to 05/31/2023", + "intent": "Create a best sellers report from May 1, 2022 to May 31, 2023", "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", "instantiation_dict": {"type": "best sellers", "start_date": "05/01/2022", "end_date": "05/31/2023"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "site": "shopping_admin", - "url_match_mode": "exact", "last_event_only": true, "ignored_query_params_patterns": ["period_type", "^show"], "decode_base64_query": true, @@ -20576,8 +21637,11 @@ }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/filter", - "event_type": "navigation", - "query_string": {"report_type": "created_at_order", "from": "05/1/2022", "to": "05/31/2023"} + "query_params": { + "report_type": [ "created_at_order" ], + "from" : [ "05/1/2022" ], + "to" : [ "05/31/2023" ] + } } } ], @@ -20589,10 +21653,9 @@ 
"intent_template_id": 24, "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 1 post ever in gadgets.", - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": {"subreddit": "gadgets", "k": "1"}, + "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", + "instantiation_dict": {"k": "1", "forum": "gadgets"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20600,9 +21663,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [19459] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/19459.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20613,10 +21679,9 @@ "intent_template_id": 24, "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 2 post ever in history.", - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": {"subreddit": "history", "k": "2"}, + "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", + "instantiation_dict": {"k": "2", "forum": "history"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20624,9 +21689,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [84338, 105990] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/58888.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/41616.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20637,10 +21713,9 @@ 
"intent_template_id": 24, "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 3 post ever in books.", - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": {"k": "3", "subreddit": "books"}, + "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", + "instantiation_dict": {"k": "3", "forum": "books"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20648,9 +21723,28 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [81371, 59421, 59447] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/81371.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/59421.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/59447.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20661,10 +21755,9 @@ "intent_template_id": 24, "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 4 post ever in movies.", - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": {"k": "4", "subreddit": "movies"}, + "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", + "instantiation_dict": {"k": "4", "forum": "movies"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20672,9 +21765,36 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [86174, 86029, 86055, 42682] } + "evaluator": 
"NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/86174.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/86029.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/86055.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/42682.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20685,10 +21805,9 @@ "intent_template_id": 24, "start_urls": ["__REDDIT__"], "intent": "Thumbs down the top 5 post ever in technology.", - "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.", - "instantiation_dict": {"k": "5", "subreddit": "technology"}, + "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", + "instantiation_dict": {"k": "5", "forum": "technology"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20696,9 +21815,44 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [48670, 134696, 48785, 70354, 70233] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/48670.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/134696.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/48785.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": 
"__REDDIT__/sv/70354.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/70233.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20708,11 +21862,10 @@ "task_id": 719, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by ThetaGang_wsb in subreddit wallstreetbets", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"subreddit": "wallstreetbets", "user": "ThetaGang_wsb"}, + "intent": "Like all submissions created by ThetaGang_wsb in forum wallstreetbets", + "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "ThetaGang_wsb", "forum": "wallstreetbets"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20720,9 +21873,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [29478, 29458] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/29478.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/29458.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -20732,11 +21896,10 @@ "task_id": 720, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by CameronKelsey in subreddit earthporn", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"subreddit": "earthporn", "user": "CameronKelsey"}, + "intent": "Like all submissions created by CameronKelsey in forum earthporn", + 
"intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "CameronKelsey", "forum": "earthporn"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20744,9 +21907,28 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [98332, 98297, 98256] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/98332.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/98297.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/98256.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -20756,11 +21938,10 @@ "task_id": 721, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by UniversityofBath in subreddit IAmA", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "UniversityofBath", "subreddit": "IAmA"}, + "intent": "Like all submissions created by UniversityofBath in forum IAmA", + "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "UniversityofBath", "forum": "IAmA"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20768,9 +21949,68 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [119742, 119719, 119714, 55155, 55142, 34032, 13175, 13170] } + "evaluator": "NetworkEventEvaluator", + 
"expected": { + "url": "__REDDIT__/sv/119742.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/119719.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/119714.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/55155.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/55142.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/34032.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/13175.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/13170.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -20780,11 +22020,10 @@ "task_id": 722, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by Don_Gato1 in subreddit nyc", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "Don_Gato1", "subreddit": "nyc"}, + "intent": "Like all submissions created by Don_Gato1 in forum nyc", + "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "Don_Gato1", "forum": "nyc"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20792,9 +22031,12 @@ "expected": {"performed_operation": "mutate", 
"status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [44650] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/44650.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } } ], "revision": 2 @@ -20804,11 +22046,10 @@ "task_id": 723, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by FTorrez81 in subreddit iphone13", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "FTorrez81", "subreddit": "iphone13"}, + "intent": "Like all submissions created by FTorrez81 in forum iphone13", + "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "FTorrez81", "forum": "iphone13"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20827,11 +22068,10 @@ "task_id": 724, "intent_template_id": 25, "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by Hrekires in subreddit news", - "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "Hrekires", "subreddit": "news"}, + "intent": "Like all submissions created by Hrekires in forum news", + "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "Hrekires", "forum": "news"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20839,10 +22079,83 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129816.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + 
{ + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129808.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129794.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129783.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129594.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129508.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43839.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43781.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43572.json", + "http_method": "POST", + "post_data": {"choice": "1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", "expected": { - "post_ids": [129816, 129808, 129794, 129783, 129594, 129508, 43839, 43781, 43572, 43558] + "url": "__REDDIT__/sv/43558.json", + "http_method": "POST", + "post_data": {"choice": "1"} } } ], @@ -20853,11 +22166,10 @@ "task_id": 725, "intent_template_id": 1510, "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"subreddit": "massachusetts", "user": "RickyDontLoseThat"}, + "intent": "DisLike all submissions created by RickyDontLoseThat in forum 
massachusetts", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "RickyDontLoseThat", "forum": "massachusetts"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20865,9 +22177,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [84954] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/84954.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20877,11 +22192,10 @@ "task_id": 726, "intent_template_id": 1510, "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"subreddit": "earthporn", "user": "jacyanthis"}, + "intent": "DisLike all submissions created by jacyanthis in forum earthporn", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "jacyanthis", "forum": "earthporn"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20900,11 +22214,10 @@ "task_id": 727, "intent_template_id": 1510, "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "PatientBuilder499", "subreddit": "videos"}, + "intent": "DisLike all submissions created by PatientBuilder499 in forum videos", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "PatientBuilder499", "forum": "videos"}, 
"format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20912,9 +22225,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [115139] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/115139.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20924,11 +22240,10 @@ "task_id": 728, "intent_template_id": 1510, "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by sirbarani in subreddit sports", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "sirbarani", "subreddit": "sports"}, + "intent": "DisLike all submissions created by sirbarani in forum sports", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "sirbarani", "forum": "sports"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20936,9 +22251,12 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [48303] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/48303.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } } ], "revision": 2 @@ -20948,11 +22266,10 @@ "task_id": 729, "intent_template_id": 1510, "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "AdamCannon", "subreddit": "UpliftingNews"}, + "intent": "DisLike all submissions created by AdamCannon 
in forum UpliftingNews", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "AdamCannon", "forum": "UpliftingNews"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -20960,34 +22277,109 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": { "post_ids": [16087] } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 730, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by Hrekires in subreddit news", - "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}", - "instantiation_dict": {"user": "Hrekires", "subreddit": "news"}, - "format_specification": null, - "start_url_context": null, - "eval": [ + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/16087.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + } + ], + "revision": 2 + }, + { + "sites": ["reddit"], + "task_id": 730, + "intent_template_id": 1510, + "start_urls": ["__REDDIT__"], + "intent": "DisLike all submissions created by Hrekires in forum news", + "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", + "instantiation_dict": {"user": "Hrekires", "forum": "news"}, + "format_specification": null, + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129816.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129808.json", + "http_method": "POST", + 
"post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129794.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129783.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129594.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/129508.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43839.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43781.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } + }, { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/sv/43572.json", + "http_method": "POST", + "post_data": {"choice": "-1"} + } }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "post_ids": [129816, 129808, 129794, 129783, 129594, 129508, 43839, 43781, 43572, 43558] + "url": "__REDDIT__/sv/43558.json", + "http_method": "POST", + "post_data": {"choice": "-1"} } } ], @@ -21002,7 +22394,6 @@ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": {"post": "Nvidia RTX 4090", "content": "EDIT: This news aged well"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21030,7 
+22421,6 @@ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": {"post": "The Night Agent", "content": "Done watching, pretty cool!"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21061,7 +22451,6 @@ "content": "Every watch makes me feel like a kid again" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21069,12 +22458,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "television", - "post_id": 135201, - "title": "Every watch makes me feel like a kid again" + "url": "__REDDIT__/f/television/135201/-/edit", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/f/television/135201/-/edit"}, + "post_data": { + "submission[title]": "Star Trek: Starfleet Academy Series, From Alex Kurtzman and Nancy Drew Creator, Ordered at Paramount+", + "submission[body]": "Every watch makes me feel like a kid again" + }, + "response_status": 302 } } ], @@ -21092,7 +22485,6 @@ "content": "Done watching. I love the renew!" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21100,12 +22492,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "television", - "content": "Done watching. I love the renew!", - "title": "\u2018Ted Lasso\u2019 Season 3 Premiere Scores 870K U.S. 
Households, Up 59% From Season 2 Premiere" + "url": "__REDDIT__/f/television/135156/-/edit", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/f/television/135156/-/edit"}, + "post_data": { + "submission[title]": "\u2018Ted Lasso\u2019 Season 3 Premiere Scores 870K U.S. Households, Up 59% From Season 2 Premiere", + "submission[body]": "Done watching. I love the renew!" + }, + "response_status": 302 } } ], @@ -21116,14 +22512,13 @@ "task_id": 735, "intent_template_id": 27, "start_urls": ["__REDDIT__"], - "intent": "Edit my post on Lord of the Rings season 2 cast announcement by adding a line to the body that says \"The cast is amazing!\"", + "intent": "Edit my post on Lord of the Rings season 2 cast announcement with the lowest vote count by adding a line to the body that says \"The cast is amazing!\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": { - "post": "Lord of the Rings season 2 cast announcement", + "post": "Lord of the Rings season 2 cast announcement with the lowest vote count", "content": "The cast is amazing!" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21131,12 +22526,16 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "television", - "content": "The cast is amazing!", - "title": "\u2018Lord of the Rings: The Rings of Power\u2019 Season 2 Cast Adds Ciar\u00e0n Hinds, Rory Kinnear and Tanya Moodie" + "url": "__REDDIT__/f/television/70886/-/edit", + "http_method": "POST", + "headers": {"referer": "__REDDIT__/f/television/70886/-/edit"}, + "post_data": { + "submission[title]": "\u2018The Lord Of The Rings: The Rings Of Power\u2019 Adds 8 To Season 2 Cast", + "submission[body]": "The cast is amazing!" 
+ }, + "response_status": 302 } } ], @@ -21154,7 +22553,6 @@ "license": "MIT licenses" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21189,15 +22587,14 @@ "task_id": 737, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers (use the provided wiki site to look up any needed information)", + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", "time": "" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21207,9 +22604,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21219,15 +22615,14 @@ "task_id": 738, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th (use the provided wiki site to look up any needed information)", + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} 
(use the provided wiki site to look up any needed information)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", "time": "in the 70th" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21237,9 +22632,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21249,15 +22643,14 @@ "task_id": 739, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th (use the provided wiki site to look up any needed information)", + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Yankees", "time": "in the 80th" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21267,9 +22660,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21279,15 +22671,14 @@ "task_id": 740, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team ", - 
"intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team (use the provided wiki site to look up any needed information)", + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "NYC NBA team", "time": "" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21297,9 +22688,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21309,15 +22699,14 @@ "task_id": 741, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}", + "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team (use the provided wiki site to look up any needed information)", + "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Boston home NBA team", "time": "" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21327,9 +22716,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": 
"__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21343,7 +22731,6 @@ "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", "instantiation_dict": {"scope": "private", "account_list": "Abishek, Vinta"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21351,13 +22738,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "planner", - "visibility": "private", - "members": ["abisubramanya27", "vinta"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": {"name": "planner", "visibility": "private"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 278, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21371,7 +22776,6 @@ "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members", "instantiation_dict": {"scope": "public", "account_list": "Abishek, Vinta"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21379,13 +22783,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_arena", - "visibility": 
"public", - "members": ["abisubramanya27", "vinta"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": {"name": "web_arena", "visibility": "public"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 278, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21399,7 +22821,6 @@ "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", "instantiation_dict": {"scope": "public", "account_list": "primer"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21407,13 +22828,22 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "AutoAGI", - "visibility": "public", - "members": ["primer"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": {"name": "AutoAGI", "visibility": "public"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 2367, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21427,7 +22857,6 @@ "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", "instantiation_dict": {"scope": 
"public", "account_list": "primer, convexegg, abishek"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21435,13 +22864,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "awesome-llms", - "visibility": "public", - "members": ["primer", "convexegg", "abisubramanya27"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": {"name": "awesome-llms", "visibility": "public"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 2367, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 43, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21455,7 +22911,6 @@ "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", "instantiation_dict": {"scope": "private", "account_list": "primer, convexegg, abishek"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21463,13 +22918,40 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "llm_bulk_inference", - "visibility": "private", - 
"members": ["primer", "convexegg", "abisubramanya27"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": {"name": "llm_bulk_inference", "visibility": "private"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 2367, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 43, "access_level": 30}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/members$", + "http_method": "POST", + "post_data": {"user_id": 5, "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21479,15 +22961,14 @@ "task_id": 747, "intent_template_id": 2100, "start_urls": ["__GITLAB__"], - "intent": "Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members", - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "intent": "Start a private project called \"awesome_web_agents\" with blank template and add Abishek, Vinta as members", + "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "awesome_web_agents", "template": "blank", "account_list": "Abishek, Vinta" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21495,15 +22976,35 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": 
"BackendStateEvaluator", - "group": "byteblaze", - "project": "awesome_web_agents", - "visibility": "private", - "initial_commit_message_substrings": ["Initial commit"], - "require_all_substrings": true, - "collaborators": ["abisubramanya27", "vinta"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": null, + "project[name]": "awesome_web_agents", + "project[path]": "awesome_web_agents", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members"}, + "http_method": "POST", + "post_data": {"user_id": "5,278"}, + "response_status": 201 + } } ], "revision": 2 @@ -21513,15 +23014,14 @@ "task_id": 748, "intent_template_id": 2100, "start_urls": ["__GITLAB__"], - "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members", - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "intent": "Start a private project called \"web_agent_android_xl\" with Android template and add primer, convexegg, abishek as members", + "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "web_agent_android_xl", "template": "Android", "account_list": "primer, convexegg, abishek" }, "format_specification": null, - "start_url_context": null, "eval": [ { 
"evaluator": "AgentResponseEvaluator", @@ -21529,15 +23029,33 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_agent_android_xl", - "visibility": "private", - "initial_commit_message_substrings": ["Initialized from 'Android' project template"], - "require_all_substrings": true, - "collaborators": ["primer", "convexegg", "abisubramanya27"], + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": { + "name": "web_agent_android_xl", + "visibility": "private", + "namespace_id": 2505, + "template_name": "android" + }, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", + "http_method": "POST", + "post_data": {"user_id": "2367,43,5", "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21547,15 +23065,14 @@ "task_id": 749, "intent_template_id": 2100, "start_urls": ["__GITLAB__"], - "intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members", - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "intent": "Start a private project called \"project_site\" with NodeJS template and add primer, convexegg, vinta as members", + "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "project_site", "template": "NodeJS", "account_list": "primer, convexegg, vinta" }, "format_specification": null, - "start_url_context": null, "eval": 
[ { "evaluator": "AgentResponseEvaluator", @@ -21563,15 +23080,33 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "project_site", - "visibility": "private", - "initial_commit_message_substrings": ["Initialized from 'NodeJS Express' project template"], - "require_all_substrings": true, - "collaborators": ["primer", "convexegg", "vinta"], + "evaluator": "NetworkEventEvaluator", "site": "gitlab", - "expected": true + "expected": { + "url": "__GITLAB__/api/v4/projects", + "http_method": "POST", + "post_data": { + "name": "project_site", + "visibility": "private", + "namespace_id": 2505, + "template_name": "express" + }, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "gitlab", + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", + "http_method": "POST", + "post_data": {"user_id": "2367,43,278", "access_level": 30}, + "response_status": 201 + } } ], "revision": 2 @@ -21581,11 +23116,14 @@ "task_id": 750, "intent_template_id": 2100, "start_urls": ["__GITLAB__"], - "intent": "Start a private project agi_index with HTML template and add Vinta Chen as members", - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", - "instantiation_dict": {"project_name": "agi_index", "template": "HTML", "account_list": "Vinta Chen"}, + "intent": "Start a private project called \"agi_index\" with HTML Gitlab pages template and add Vinta Chen as members", + "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", + "instantiation_dict": { + "project_name": "agi_index", + "template": "HTML Gitlab pages", + "account_list": "Vinta Chen" 
+ }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21593,14 +23131,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "agi_index", - "fields": {"name": "agi_index", "description": "plain HTML", "visibility": "private"}, - "collaborators": ["vinta"], - "description_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "plainhtml", + "project[name]": "agi_index", + "project[path]": "agi_index", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/byteblaze/agi_index/-/project_members"}, + "http_method": "POST", + "post_data": {"user_id": "278"}, + "response_status": 201 + } } ], "revision": 2 @@ -21610,15 +23165,14 @@ "task_id": 751, "intent_template_id": 2100, "start_urls": ["__GITLAB__"], - "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members", - "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members", + "intent": "Start a private project called \"AGISite\" with JEKYLL template and add Rohan and Vinta as members", + "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", "instantiation_dict": { "project_name": "AGISite", "template": "JEKYLL", "account_list": "Rohan and Vinta" }, "format_specification": null, - "start_url_context": null, "eval": 
[ { "evaluator": "AgentResponseEvaluator", @@ -21626,14 +23180,35 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "AGISite", - "fields": {"name": "AGISite", "description": "Jekyll site", "visibility": "private"}, - "collaborators": ["Seirdy", "vinta"], - "description_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "jekyll", + "project[name]": "AGISite", + "project[path]": "AGISite", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/byteblaze/AGISite/-/project_members"}, + "http_method": "POST", + "post_data": {"user_id": "2366,278"}, + "response_status": 201 + } } ], "revision": 2 @@ -21643,11 +23218,14 @@ "task_id": 752, "intent_template_id": 332, "start_urls": ["__GITLAB__"], - "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": {"project_name": "web_agent", "template": "blank"}, + "intent": "Create a private blank repository called \"web_agent\"", + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "instantiation_dict": { + "project_name": "web_agent", + "template": "blank with 
readme", + "description": "" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21655,14 +23233,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_agent", - "visibility": "private", - "initial_commit_message_substrings": ["Initial commit"], - "require_all_substrings": false, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "web_agent", + "project[path]": "web_agent", + "project[namespace_id]": "2505", + "project[visibility_level]": "0", + "project[initialize_with_readme]": "1" + }, + "response_status": 302 + } } ], "revision": 2 @@ -21673,10 +23256,13 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": {"project_name": "web_agent_android_xs", "template": "Android"}, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "instantiation_dict": { + "project_name": "web_agent_android_xs", + "template": "Android", + "description": " using the right template to speed up development." 
+ }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21684,14 +23270,19 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_agent_android_xs", - "visibility": "private", - "initial_commit_message_substrings": ["Initialized from 'Android' project template"], - "require_all_substrings": false, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "android", + "project[name]": "web_agent_android_xs", + "project[path]": "web_agent_android_xs", + "project[namespace_id]": "2505", + "project[visibility_level]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -21702,10 +23293,13 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": {"project_name": "web_agent_nodejs", "template": "NodeJS"}, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "instantiation_dict": { + "project_name": "web_agent_nodejs", + "template": "NodeJS", + "description": " using the right template to speed up development." 
+ }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21731,10 +23325,13 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": {"project_name": "web_agent_index", "template": "HTML"}, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "instantiation_dict": { + "project_name": "web_agent_index", + "template": "HTML", + "description": " using the right template to speed up development." + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21763,10 +23360,13 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.", - "instantiation_dict": {"project_name": "11711_gitlab", "template": "JEKYLL"}, + "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "instantiation_dict": { + "project_name": "11711_gitlab", + "template": "JEKYLL", + "description": " using the right template to speed up development." 
+ }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21797,7 +23397,6 @@ "city2": "home of the 1991 Super Bowl champions" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21807,9 +23406,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21823,7 +23421,6 @@ "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", "instantiation_dict": {"city1": "the big apple", "city2": "biggest city in Maine"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21833,9 +23430,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21852,7 +23448,6 @@ "city2": "New York City" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21862,9 +23457,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21881,7 +23475,6 @@ "city2": "the city where my E-commerce customer Amanda Kim lives" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21891,9 +23484,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": 
{"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21911,7 +23503,6 @@ "transportation": "walk" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21921,9 +23512,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21941,7 +23531,6 @@ "transportation": "driving" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21951,9 +23540,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21963,11 +23551,10 @@ "task_id": 763, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest Trader Joe's from 401 Shady Ave, Pittsburgh.", + "intent": "Find the walkway to the closest Trader Joe's from 401 Shady Ave, Pittsburgh.", "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", "instantiation_dict": {"store": "Trader Joe's", "location": "401 Shady Ave, Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -21977,9 +23564,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -21993,7 +23579,6 @@ "intent_template": "Find the walkway to the closest
{{store}} from {{location}}.", "instantiation_dict": {"store": "Target", "location": "401 Shady Ave, Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22003,9 +23588,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -22019,7 +23603,6 @@ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", "instantiation_dict": {"store": "Japanese food market", "location": "401 Shady Ave, Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22029,9 +23612,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -22045,7 +23627,6 @@ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", "instantiation_dict": {"store": "grocessory owned by Amazon", "location": "401 Shady Ave, Pittsburgh"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22055,9 +23636,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", - "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -22074,7 +23654,6 @@ "location": "401 Shady Ave, Pittsburgh" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22084,9 +23663,8 @@ { "evaluator": "NetworkEventEvaluator", "site": "map", 
- "url_match_mode": "exact", "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200, "event_type": "navigation"} + "expected": {"url": "__MAP__", "response_status": 200} } ], "revision": 2 @@ -22100,7 +23678,6 @@ "intent_template": "{{quantity}} {{product}} arrived, update the stock", "instantiation_dict": {"quantity": "5", "product": "blue Cronus yoga pants with size 33"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22125,7 +23702,6 @@ "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", "instantiation_dict": {"quantity": "378", "product": "brown Aero daily fitness tee in every size"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22133,34 +23709,45 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "544", - "site": "shopping_admin", - "expected": {"stock_qty": 478} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/544/type/simple/store/0/set/9/", + "response_status": 302, + "http_method": "POST" + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "547", - "site": "shopping_admin", - "expected": {"stock_qty": 478} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/547/type/simple/store/0/set/9/", + "http_method": "POST" + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "550", - "site": "shopping_admin", - "expected": {"stock_qty": 478} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/550/type/simple/store/0/set/9/", + "http_method": "POST" + }, + 
"ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "553", - "site": "shopping_admin", - "expected": {"stock_qty": 478} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/553/type/simple/store/0/set/9/", + "http_method": "POST" + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "556", - "site": "shopping_admin", - "expected": {"stock_qty": 478} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/556/type/simple/store/0/set/9/", + "http_method": "POST" + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -22174,7 +23761,6 @@ "intent_template": "We've received {{quantity}}, update the inventory.", "instantiation_dict": {"quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22182,16 +23768,30 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1836", - "site": "shopping_admin", - "expected": {"stock_qty": 112} + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "http_method": "POST", + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/1836/type/simple/store/0/set/10/", + "headers": {"referer": "__SHOPPING_ADMIN__/catalog/product/edit/id/1836/"}, + "post_data": {"product[quantity_and_stock_status][qty]": "112"}, + "response_status": 200, + "redirect_url": "" + } }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1838", - "site": "shopping_admin", - "expected": {"stock_qty": 156} + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": 
["^form_key$"], + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "http_method": "POST", + "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/1838/type/simple/store/0/set/10/", + "headers": {"referer": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/"}, + "post_data": {"product[quantity_and_stock_status][qty]": "156"}, + "response_status": 200, + "redirect_url": "" + } } ], "revision": 2 @@ -22201,11 +23801,10 @@ "task_id": 771, "intent_template_id": 243, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Approve the positive reviews to display in our store.", - "intent_template": "Approve the positive reviews to display in our store.", + "intent": "Approve reviews with four stars or higher to display in our store.", + "intent_template": "Approve reviews with four stars or higher to display in our store.", "instantiation_dict": {}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22213,22 +23812,26 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "352", - "site": "shopping_admin", - "expected": {"approved": true} - }, - { - "evaluator": "BackendStateEvaluator", - "review_id": "349", - "site": "shopping_admin", - "expected": {"approved": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "http_method": "POST", + "url": "__SHOPPING_ADMIN__/review/product/save/id/352", + "headers": {"referer": "__SHOPPING_ADMIN__/review/product/edit/id/352/"}, + "post_data": {"status_id": "1"}, + "redirect_url": "^__SHOPPING_ADMIN__/review/product/.*$", + "response_status": 302 + } }, { - "evaluator": "BackendStateEvaluator", - "review_id": "347", - "site": "shopping_admin", - "expected": {"approved": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "http_method": "POST", + "url": "__SHOPPING_ADMIN__/review/product/save/id/347", + "headers": {"referer": 
"__SHOPPING_ADMIN__/review/product/edit/id/347/"}, + "post_data": {"status_id": "1"}, + "redirect_url": "^__SHOPPING_ADMIN__/review/product/.*$", + "response_status": 302 + } } ], "revision": 2 @@ -22242,7 +23845,6 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending negative reviews for Circe fleece"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22267,7 +23869,6 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending negative reviews"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22298,7 +23899,6 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending reviews with less than 4 stars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22335,7 +23935,6 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "reviews from the scammer Arden"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22360,7 +23959,6 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "reviews from the scammer Carlo"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22368,16 +23966,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "93", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/109/", + "response_status": 302, + "http_method": "POST" + } }, { - "evaluator": "BackendStateEvaluator", - "review_id": "109", - "site": "shopping_admin", - 
"expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/93/", + "response_status": 302, + "http_method": "POST" + } } ], "revision": 2 @@ -22395,7 +23997,6 @@ "config": "green Hollister backyard sweatshirt in all sizes" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22448,7 +24049,6 @@ "config": "size 28 Sahara leggings" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22489,7 +24089,6 @@ "config": "yellow shirts from Gwyn Endurance in all sizes below L" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22522,15 +24121,14 @@ "task_id": 780, "intent_template_id": 742, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], - "intent": "Increase the price of white Ingrid Running with size L and above by $17", + "intent": "Increase the price of white Ingrid Running with size L and above on the current page by $17", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "$17", "performed_operation": "Increase", - "config": "white Ingrid Running with size L and above" + "config": "white Ingrid Running with size L and above on the current page" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22565,7 +24163,6 @@ "config": "black fitness tshirts from Desiree with size XS" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22594,7 +24191,6 @@ "config": "all blue running tshirts in extra small and small sizes" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22633,15 +24229,14 
@@ "task_id": 783, "intent_template_id": 351, "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']", + "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: \"yjlou\" and \"a11yproject\"", "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "kkroening/ffmpeg-python", "role": "maintainer", - "user_list": ["yjlou", "a11yproject"] + "user_list": "\"yjlou\" and \"a11yproject\"" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22651,6 +24246,24 @@ "status": "PERMISSION_DENIED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/133/invitations", + "http_method": "POST", + "post_data": {"user_id": "168"}, + "response_status": 201 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/api/v4/projects/133/invitations", + "http_method": "POST", + "post_data": {"user_id": "2325"}, + "response_status": 201 + } } ], "revision": 2 @@ -22660,15 +24273,14 @@ "task_id": 784, "intent_template_id": 316, "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], - "intent": "Tell me the email address of the contributor who has the most commits to branch master", - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "intent": "Get the email address of the contributor who has the most commits to branch master in the current repository", + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", "instantiation_dict": {"branch_name": "master", "attribute": "email address"}, "format_specification": null, - "start_url_context": "Complete the task given the 
context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", @@ -22684,15 +24296,14 @@ "task_id": 785, "intent_template_id": 316, "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], - "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page", - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "intent": "Get the email address of the contributor who has the most commits to branch gh-page in the current repository", + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", "instantiation_dict": {"branch_name": "gh-page", "attribute": "email address"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { "performed_operation": "retrieve", @@ -22708,15 +24319,14 @@ "task_id": 786, "intent_template_id": 316, "start_urls": ["__GITLAB__/vinta/awesome-python"], - "intent": "Tell me the number of commits of the contributor who has the most commits to branch master", - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "intent": "Get the number of commits of the contributor who has the most commits to branch master in the current repository", + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", "instantiation_dict": {"branch_name": "master", "attribute": "number of commits"}, 
"format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -22732,15 +24342,14 @@ "task_id": 787, "intent_template_id": 316, "start_urls": ["__GITLAB__/dehenne/awesome-visibility/-/graphs/master"], - "intent": "Tell me the number of followers of the contributor who has the most commits to branch master", - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "intent": "Get the number of followers of the contributor who has the most commits to branch master in the current repository", + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", "instantiation_dict": {"branch_name": "master", "attribute": "number of followers"}, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { "performed_operation": "retrieve", @@ -22756,30 +24365,28 @@ "task_id": 788, "intent_template_id": 316, "start_urls": ["__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed"], - "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52", - "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}", + "intent": "Get the full name, username, user location and email of the contributor who has the most commits to branch php52 in the current repository", + "intent_template": "Get the 
{{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", "instantiation_dict": { "branch_name": "php52", - "attribute": "full name, gitlab account name, location and email address" + "attribute": "full name, username, user location and email" }, - "format_specification": "Use an object with keys: \"first_name\", \"last_name\", \"gitlab_account_name\", \"location\" and \"email_address\"", - "start_url_context": "Complete the task given the context of the current page.", + "format_specification": "Use an object with keys: \"full_name\", \"username\", \"user_location\" and \"email\"", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": "false", + "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "first_name" : { "type": "string" }, - "last_name" : { "type": "string" }, - "gitlab_account_name": { "type": "string" }, - "location" : { "type": "string" }, - "email_address" : { "type": "string" } + "full_name" : { "type": "string" }, + "username" : { "type": "string" }, + "user_location": { "type": "string" }, + "email" : { "type": "string" } }, - "required": ["email_address", "first_name", "gitlab_account_name", "last_name", "location"] + "required": ["full_name", "username", "user_location", "email"] } }, "expected": { @@ -22787,11 +24394,10 @@ "status": "SUCCESS", "retrieved_data": [ { - "first_name": "Andrew", - "last_name": "Whalen", - "gitlab_account_name": "amwhalen", - "location": "Massachusetts", - "email_address": "github@amwhalen.com" + "full_name": "Andrew Whalen", + "username": "amwhalen", + "user_location": "Massachusetts", + "email": "github@amwhalen.com" } ] } @@ -22812,7 +24418,6 @@ "repo": "huggingface dataset" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22835,13 +24440,12 @@ "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": 
"negative reviews for Sybil running short"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -22854,20 +24458,24 @@ "task_id": 791, "intent_template_id": 84, "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a folder named car in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the cars subreddit.", - "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}} subreddit.", - "instantiation_dict": {"directory": "car", "subreddit": "cars", "gitlab_repo": "gimmiethat.space"}, + "intent": "Use the Web IDE to create a folder named car in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the cars forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", + "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", + "instantiation_dict": {"directory": "car", "gitlab_repo": "gimmiethat.space", "forum": "cars"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "mutate", "status": "NOT_FOUND_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__REDDIT__/forums"} } ], "revision": 2 @@ -22877,11 +24485,10 @@ "task_id": 792, "intent_template_id": 172, "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20. Discard any items in your cart if it is not empty.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "Beauty & Personal Care", "dollar_value": "under 20"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22900,14 +24507,13 @@ "task_id": 793, "intent_template_id": 172, "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 129.99.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.", + "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 129.99. 
Discard any items in your cart if it is not empty.", + "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", "instantiation_dict": { "product_category": "Men clothing", "dollar_value": "above 50 but under 129.99" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22926,11 +24532,13 @@ "task_id": 794, "intent_template_id": 191, "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.", - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": {"address": "4000 Forbes Ave, Pittsburgh, PA"}, + "intent": "Change the delivery address for my most recent non canceled order to 4000 Forbes Ave, Pittsburgh, PA.", + "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", + "instantiation_dict": { + "order_selector": "my most recent non canceled order", + "address": "4000 Forbes Ave, Pittsburgh, PA" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22940,6 +24548,11 @@ "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -22949,20 +24562,27 @@ "task_id": 795, "intent_template_id": 191, "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": {"address": "6726 McPherson Blvd, Pittsburgh, PA"}, + "intent": "Change the delivery address for my second most recent order to 6726 McPherson Blvd, Pittsburgh, 
PA.", + "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", + "instantiation_dict": { + "order_selector": "my second most recent order", + "address": "6726 McPherson Blvd, Pittsburgh, PA" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "performed_operation": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -22972,11 +24592,13 @@ "task_id": 796, "intent_template_id": 191, "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.", - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": {"address": "155 5th Street, San Francisco, CA"}, + "intent": "Change the delivery address for my oldest order in 2023 to 155 5th Street, San Francisco, CA.", + "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", + "instantiation_dict": { + "order_selector": "my oldest order in 2023", + "address": "155 5th Street, San Francisco, CA" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -22986,6 +24608,11 @@ "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -22995,11 +24622,13 @@ "task_id": 797, "intent_template_id": 191, "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.", - "intent_template": "Change the delivery address for my most recent order to 
{{address}}.", - "instantiation_dict": {"address": "3 Oxford St, Cambridge, MA"}, + "intent": "Change the delivery address for my first order ever to 3 Oxford St, Cambridge, MA.", + "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", + "instantiation_dict": { + "order_selector": "my first order ever", + "address": "3 Oxford St, Cambridge, MA" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23009,6 +24638,11 @@ "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -23018,11 +24652,13 @@ "task_id": 798, "intent_template_id": 191, "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.", - "intent_template": "Change the delivery address for my most recent order to {{address}}.", - "instantiation_dict": {"address": "77 Massachusetts Ave, Cambridge, MA"}, + "intent": "Change the delivery address for my most recent non canceled order to 77 Massachusetts Ave, Cambridge, MA.", + "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", + "instantiation_dict": { + "order_selector": "my most recent non canceled order", + "address": "77 Massachusetts Ave, Cambridge, MA" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23032,6 +24668,11 @@ "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } + }, + { + "evaluator": "NetworkEventEvaluator", + "site": "shopping", + "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], "revision": 2 @@ -23045,7 +24686,6 @@ "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "n-lab", "members": "patou, egpast, westurner, 
jontutcher"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23053,11 +24693,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group_name": "n-lab", - "member_usernames": ["patou", "egpast", "westurner", "jontutcher"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": "__GITLAB__/groups", + "http_method": "POST", + "post_data": {"group[name]": "n-lab", "group[path]": "n-lab"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/groups/n-lab/-/group_members"}, + "http_method": "POST", + "post_data": {"user_id": "400,443,561,586"}, + "response_status": 201 + } } ], "revision": 2 @@ -23074,7 +24734,6 @@ "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23082,11 +24741,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group_name": "x-lab", - "member_usernames": ["JonasVautherin", "dilipchandima", "dawiss1337", "bmyun", "DCMJY"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": "__GITLAB__/groups", + "http_method": "POST", + "post_data": {"group[name]": "x-lab", "group[path]": "x-lab"}, + "response_status": 302 + } + }, 
+ { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/groups/x-lab/-/group_members"}, + "http_method": "POST", + "post_data": {"user_id": "632,64,86,96,340"}, + "response_status": 201 + } } ], "revision": 2 @@ -23100,7 +24779,6 @@ "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "crew", "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23108,11 +24786,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group_name": "crew", - "member_usernames": ["ASWATFZLLC", "patrickhlauke", "westurner", "linkmatrix"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": "__GITLAB__/groups", + "http_method": "POST", + "post_data": {"group[name]": "crew", "group[path]": "crew"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/groups/crew/-/group_members"}, + "http_method": "POST", + "post_data": {"user_id": "83,119,561,145"}, + "response_status": 201 + } } ], "revision": 2 @@ -23126,7 +24824,6 @@ "intent_template": "create a new group \"{{name}}\" with members {{members}}", 
"instantiation_dict": {"name": "coding_friends", "members": "qhduan, Agnes-U"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23134,11 +24831,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group_name": "coding_friends", - "member_usernames": ["qhduan", "Agnes-U"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": "__GITLAB__/groups", + "http_method": "POST", + "post_data": {"group[name]": "coding_friends", "group[path]": "coding_friends"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/groups/coding_friends/-/group_members"}, + "http_method": "POST", + "post_data": {"user_id": "196,244"}, + "response_status": 201 + } } ], "revision": 2 @@ -23152,7 +24869,6 @@ "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "webagent", "members": "pandey2000, sayakpaul, sayakpaul"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23160,11 +24876,31 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group_name": "webagent", - "member_usernames": ["pandey2000", "sayakpaul"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "expected": { + "url": 
"__GITLAB__/groups", + "http_method": "POST", + "post_data": {"group[name]": "webagent", "group[path]": "webagent"}, + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { "user_id": {"type": "string", "format": "string_list"} } + }, + "expected": { + "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", + "headers": {"referer": "__GITLAB__/groups/webagent/-/group_members"}, + "http_method": "POST", + "post_data": {"user_id": "223,224"}, + "response_status": 201 + } } ], "revision": 2 @@ -23174,11 +24910,13 @@ "task_id": 804, "intent_template_id": 999, "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding flash alerts to myself and primer.", + "intent": "Assign the issue regarding flash alerts in dialog components to myself and tag primer to include them as a participant (username only).", "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", - "instantiation_dict": {"issue": "flash alerts", "account": "myself and primer"}, + "instantiation_dict": { + "issue": "flash alerts in dialog components", + "account": "myself and tag primer to include them as a participant" + }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23186,13 +24924,23 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "usernames": ["byteblaze", "primer"], - "group": "primer", - "project": "design", - "issue_iid": 104, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "expected": { + "url": "__GITLAB__/primer/design/-/issues/104.json", + "http_method": "PUT", + "post_data": { "$.issue.assignee_ids": [2330] } + } + }, + { + "evaluator": "NetworkEventEvaluator", + 
"gen_missing_post_data_schema": true, + "expected": { + "url": "__GITLAB__/primer/design/notes", + "query_params": { "target_type": ["Issue"], "target_id": ["83759"] }, + "http_method": "POST", + "post_data": {"$.note.note": "@primer"} + } } ], "revision": 2 @@ -23202,30 +24950,24 @@ "task_id": 805, "intent_template_id": 335, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for the branch that implements the support of template strings in the current repository to be merged into the default branch branch, assign myself and Roshan as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "the branch that implements the support of template strings", - "target_branch": "master", - "reviewer": "myself and Roshan" + "target_branch": "the default branch", + "reviewer": "myself and Roshan", + "source_project": "the current repository" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "source_branch": "feature/replace-gulp", - "target_branch": "main", - "reviewer": "byteblaze", - "site": "gitlab", - "expected": true + "expected": { + "performed_operation": "mutate", + "status": 
"ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } ], "revision": 2 @@ -23235,15 +24977,15 @@ "task_id": 806, "intent_template_id": 335, "start_urls": ["__GITLAB__"], - "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for redesign in a11yproject.com to be merged into markdown-figure-block branch, assign myself as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { - "source_branch": "a11yproject.com/redesign", + "source_branch": "redesign", "target_branch": "markdown-figure-block", - "reviewer": "myself" + "reviewer": "myself", + "source_project": "a11yproject.com" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23251,14 +24993,20 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "source_branch": "redesign", - "target_branch": "feature/markdown-figure-block", - "reviewer": "byteblaze", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", + "http_method": "POST", + "post_data": { + "merge_request[target_branch]": "feature/markdown-figure-block", + "merge_request[source_branch]": "redesign", + "merge_request[target_project_id]": "174", + "merge_request[source_project_id]": "174", + "merge_request[assignee_ids][]": "0", + "merge_request[reviewer_ids][]": "2330" + }, + "response_status": 302 + } } 
], "revision": 2 @@ -23268,30 +25016,24 @@ "task_id": 807, "intent_template_id": 335, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", + "intent": "Submit a merge request for build time debug in the current repository to be merged into main branch, assign myself as the reviewer", + "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", "instantiation_dict": { "source_branch": "build time debug", "target_branch": "main", - "reviewer": "myself" + "reviewer": "myself", + "source_project": "the current repository" }, "format_specification": null, - "start_url_context": "Complete the task given the context of the current page.", "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "BackendStateEvaluator", - "group": "primer", - "project": "design", - "source_branch": "debug-build-time", - "target_branch": "main", - "reviewer": "byteblaze", - "site": "gitlab", - "expected": true + "expected": { + "performed_operation": "mutate", + "status": "ACTION_NOT_ALLOWED_ERROR", + "retrieved_data": null + } } ], "revision": 2 @@ -23301,8 +25043,8 @@ "task_id": 808, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. 
Set due date to be {{due}}", + "intent": "Create an issue in cloud-to-butt repo with title \"Let\"s keep the project alive\". Assign the issue to myself (you can invite users to the project if needed). Set due date to be the end of Q1 2033", + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). Set due date to be {{due}}", "instantiation_dict": { "repo": "cloud-to-butt", "issue": "Let's keep the project alive", @@ -23310,7 +25052,6 @@ "due": "the end of Q1 2033" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23318,16 +25059,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "cloud-to-butt", - "fields": { - "title": "Let's keep the project alive", - "assignee.username": "byteblaze", - "due_date": "2033-03-31" - }, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", + "http_method": "POST", + "post_data": { + "issue[title]": "Let\"s keep the project alive", + "issue[assignee_ids][]": "2330", + "issue[due_date]": "2033-03-31" + }, + "response_status": 302 + } } ], "revision": 2 @@ -23337,8 +25079,8 @@ "task_id": 809, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}", + "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself (you can invite users to the project if needed). 
Set due date to be 2030-1-3", + "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). Set due date to be {{due}}", "instantiation_dict": { "repo": "a11yproject", "issue": "404 for many URLs", @@ -23346,7 +25088,6 @@ "due": "2030-1-3" }, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23354,16 +25095,17 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "a11yproject", - "project": "a11yproject.com", - "fields": { - "title": "404 for many URLs", - "assignee.username": "byteblaze", - "due_date": "2030-01-03" - }, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", + "http_method": "POST", + "post_data": { + "issue[title]": "404 for many URLs", + "issue[assignee_ids][]": "2330", + "issue[due_date]": "2030-01-03" + }, + "response_status": 302 + } } ], "revision": 2 @@ -23373,11 +25115,10 @@ "task_id": 810, "intent_template_id": 999, "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.", + "intent": "Assign the issue regarding flash alert bug in primer design guide repo to myself.", "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", - "instantiation_dict": {"repo": "primer design guide", "issue": "flash alert", "account": "myself"}, + "instantiation_dict": {"repo": "primer design guide", "issue": "flash alert bug", "account": "myself"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23385,13 +25126,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": 
"BackendStateEvaluator", - "usernames": ["byteblaze"], - "group": "primer", - "project": "design", - "issue_iid": 104, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "expected": { + "url": "__GITLAB__/primer/design/-/issues/316.json", + "http_method": "PUT", + "post_data": { "$.issue.assignee_ids": [2330] } + } } ], "revision": 2 @@ -23405,7 +25146,6 @@ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "myself"}, "format_specification": null, - "start_url_context": null, "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -23413,13 +25153,13 @@ "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "usernames": ["byteblaze"], - "group": "a11yproject", - "project": "a11yproject.com", - "issue_iid": 1478, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "ignored_query_params": ["serializer"], + "expected": { + "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1478.json", + "http_method": "PUT", + "post_data": { "$.issue.assignee_ids": [2330] } + } } ], "revision": 2 From c0c08148c47b30fd35a67bc48061bebdeafe44c9 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 10 Nov 2025 20:46:10 +0000 Subject: [PATCH 39/64] small debug --- .../src/browsergym/webarena_verified/evaluators.py | 9 ++++++--- .../src/browsergym/webarena_verified/task.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index 8242851e..da436ea7 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ 
-118,7 +118,10 @@ def __call__( logger.info( f"- {result.evaluator_name}: status: {result.status}, score: {result.score}, error_msg: {result.error_msg}" ) - # return average score - return sum(result.score for result in results.evaluators_results) / len( - results.evaluators_results + # return average score if multiple evaluators are present, otherwise return the aggregated score + return ( + sum(result.score for result in results.evaluators_results) + / len(results.evaluators_results) + if results.evaluators_results + else results.score ) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 1afa34d0..65ee439d 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -118,7 +118,7 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # authenticate for site in self.config["sites"]: - for attempt in range(3): + for attempt in range(3): # Try 3 times in case of timeout try: self.webarena_instance.ui_login(site=site, page=page) break # Success, move to next site @@ -138,7 +138,14 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: # https://github.com/web-arena-x/webarena/blob/c6475f0e9affe5252a2966e26b8cb4c834a4ae40/browser_env/envs.py#L150 if start_urls := self.config.get("start_urls"): for i, url in enumerate(start_urls): - page.goto(url) + for attempt in range(3): # Try 3 times in case of timeout + try: + page.goto(url) + break # Success, move to next url + except playwright_errors.TimeoutError as e: + if attempt == 2: # Last attempt (0, 1, 2) + raise # Re-raise the timeout error after 3 failed attempts + sleep(1) # Wait 1 second before retrying if i < len(start_urls) - 1: page = page.context.new_page() From d7dc8459ce47fe71d93d25124c02bb870d28850a Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 13 Nov 2025 
20:55:18 +0000 Subject: [PATCH 40/64] add massage of shopping_admin tasks --- .../experiments/src/browsergym/experiments/benchmark/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index e87ab50c..f49f4089 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -159,14 +159,14 @@ def prepare_backend(backend: str): [ f"webarena_verified.{intent_template_id}.{task_id}" for intent_template_id, task_id in [ - # gitlab, shopping_admin, and map are not ready yet + # gitlab and map are not ready yet (23, 410), # reddit # (330, 533), # gitlab # (87, 561), # gitlab wiki # (87, 562), # gitlab reddit (165, 574), # shopping (16, 640), # reddit - # (253, 680), # shopping_admin + (253, 680), # shopping_admin # (94, 740), # wiki map ] ] From b8a666a73a56f1b24905eaba1cb4c248782ee6da Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Dec 2025 19:37:56 +0000 Subject: [PATCH 41/64] assume all endpoints are running --- .../src/browsergym/experiments/benchmark/utils.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index f49f4089..ef9d37c0 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -159,15 +159,14 @@ def prepare_backend(backend: str): [ f"webarena_verified.{intent_template_id}.{task_id}" for intent_template_id, task_id in [ - # gitlab and map are not ready yet (23, 410), # reddit - # (330, 533), # gitlab - # (87, 561), # gitlab wiki - # (87, 562), # gitlab reddit + (330, 533), # gitlab + (87, 561), # gitlab wiki + (87, 562), # 
gitlab reddit (165, 574), # shopping (16, 640), # reddit (253, 680), # shopping_admin - # (94, 740), # wiki map + (94, 740), # wiki map ] ] ) From 49506a7d8bf65a09be4f08f70a662c5ef966c76a Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Dec 2025 19:43:25 +0000 Subject: [PATCH 42/64] update to latest version before the public release --- .../webarena_verified/webarena_verified.json | 10513 +++++++--------- 1 file changed, 4654 insertions(+), 5859 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json index e3c15594..eb73812e 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @@ -5,16 +5,14 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-1 best-selling product name(s) in 2022", - "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", - "instantiation_dict": {"n": 1, "year": 2022}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 1, "entity": "product name(s)", "period": "2022"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Quest Lumaflex\u2122 Band"] } @@ -28,19 +26,13 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-1 best-selling brand name(s) in Quarter 1 2022", - "intent_template": "Get the top-{{n}} best-selling brand name(s) in {{period}}", - "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, - "format_specification": null, + "intent_template": "Get the 
top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 1, "period": "Quarter 1 2022", "entity": "brand name(s)"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Sprite"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Sprite"] } } ], "revision": 2 @@ -51,16 +43,14 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-1 best-selling product type name(s) in Quarter 1 2022", - "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", - "instantiation_dict": {"n": 1, "period": "Quarter 1 2022"}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 1, "period": "Quarter 1 2022", "entity": "product type name(s)"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ ["Digital Watch", "Band", "Stasis Ball", "Yoga Strap"] ] } @@ -74,16 +64,14 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-2 best-selling product name(s) in 2022", - "intent_template": "Get the top-{{n}} best-selling product name(s) in {{year}}", - "instantiation_dict": {"n": 2, "year": 2022}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 2, "entity": "product name(s)", "period": "2022"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": 
"retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Quest Lumaflex\u2122 Band", @@ -103,16 +91,14 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-3 best-selling product name(s) in Jan 2023", - "intent_template": "Get the top-{{n}} best-selling product name(s) in {{period}}", - "instantiation_dict": {"n": 3, "period": "Jan 2023"}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 3, "period": "Jan 2023", "entity": "product name(s)"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Impulse Duffle", "Overnight Duffle", "Hawkeye Yoga Short-32-Blue"] } @@ -126,19 +112,13 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-1 best-selling product type name(s) in Jan 2023", - "intent_template": "Get the top-{{n}} best-selling product type name(s) in {{period}}", - "instantiation_dict": {"n": 1, "period": "Jan 2023"}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 1, "period": "Jan 2023", "entity": "product type name(s)"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Duffle"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Duffle"] } } ], "revision": 2 @@ -149,16 +129,14 @@ "intent_template_id": 279, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Get the top-2 best-selling product name(s) in 2023", - "intent_template": "Get the top-{{n}} 
best-selling product name(s) in {{year}}", - "instantiation_dict": {"n": 2, "year": 2023}, - "format_specification": null, + "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", + "instantiation_dict": {"n": 2, "entity": "product name(s)", "period": "2023"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Sprite Yoga Strap 6 foot", @@ -174,18 +152,17 @@ "task_id": 7, "intent_template_id": 79, "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 50 km to Carnegie Mellon University", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 50 km to Carnegie Mellon University. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Mellon University", - "radius": "50 km" + "radius": "50 km", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" }, - "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -195,12 +172,11 @@ "name" : { "type": "string" }, "state" : { "type": "string" }, "postcode": { "type": "string" } - }, - "required": ["name", "state", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -219,23 +195,19 @@ "task_id": 8, "intent_template_id": 79, "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 5 km to Carnegie Mellon University", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 5 km to Carnegie Mellon University. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Mellon University", - "radius": "5 km" + "radius": "5 km", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" }, - "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -245,18 +217,17 @@ "task_id": 9, "intent_template_id": 79, "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 30 km to Carnegie Art Museum", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 30 km to Carnegie Art Museum. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "airport_type": "international airports", "start": "Carnegie Art Museum", - "radius": "30 km" + "radius": "30 km", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" }, - "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -266,12 +237,11 @@ "name" : { "type": "string" }, "state" : { "type": "string" }, "postcode": { "type": "string" } - }, - "required": ["name", "state", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -290,18 +260,17 @@ "task_id": 10, "intent_template_id": 79, "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all US international airports that are within a driving distance of 60 km to Niagara Falls", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}", + "intent": "Get the name, state, and zip code of all US international airports that are within a driving distance of 60 km to Niagara Falls. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", + "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "airport_type": "US international airports", "start": "Niagara Falls", - "radius": "60 km" + "radius": "60 km", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" }, - "format_specification": "Use \"name\" for the name, \"state\" for the state, and \"postcode\" for the postcode.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -311,12 +280,11 @@ "name" : { "type": "string" }, "state" : { "type": "string" }, "postcode": { "type": "string" } - }, - "required": ["name", "state", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -325,7 +293,7 @@ "postcode": "14304" }, { - "name": "Buffalo-Niagara International Airport", + "name": "Buffalo Niagara International Airport", "state": "New York", "postcode": "14225" } @@ -343,17 +311,11 @@ "intent": "Get the total number of reviews that our store received so far that mention term \"disappointed\"", "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "disappointed"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [6] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [6] } } ], "revision": 2 @@ -366,17 +328,11 @@ "intent": "Get the total number of reviews that our store received so far that mention term \"satisfied\"", "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "satisfied"}, - "format_specification": 
null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } } ], "revision": 2 @@ -389,17 +345,11 @@ "intent": "Get the total number of reviews that our store received so far that mention term \"decent\"", "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "decent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } } ], "revision": 2 @@ -412,17 +362,11 @@ "intent": "Get the total number of reviews that our store received so far that mention term \"not useful\"", "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "not useful"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -435,17 +379,11 @@ "intent": "Get the total number of reviews that our store received so far that mention term \"best\"", "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", "instantiation_dict": {"term": "best"}, - "format_specification": 
null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } } ], "revision": 2 @@ -455,14 +393,16 @@ "task_id": 16, "intent_template_id": 73, "start_urls": ["__MAP__"], - "intent": "Tell me the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center", - "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": {"start": "5000 Fifth Avenue, Pittsburgh", "end": "UPMC family health center"}, - "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "intent": "Get the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "5000 Fifth Avenue, Pittsburgh", + "end": "UPMC family health center", + "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -470,12 +410,11 @@ "properties": { "mode" : { "type": "string" }, "duration": { "type": "string", "format": "duration" } - }, - "required": ["duration", "mode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "mode": "driving", "duration": "2min" }, @@ -491,14 +430,16 @@ "task_id": 17, "intent_template_id": 73, "start_urls": ["__MAP__"], - "intent": "Tell me the time for walking and driving route from AMC Waterfront to Carnegie Mellon University", - "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": {"start": "AMC Waterfront", "end": "Carnegie Mellon University"}, - "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "intent": "Get the time for walking and driving route from AMC Waterfront to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Carnegie Mellon University", + "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -506,12 +447,11 @@ "properties": { "mode" : { "type": "string" }, "duration": { "type": "string", "format": "duration" } - }, - "required": ["duration", "mode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "mode": "driving", "duration": "13min" }, @@ -527,14 +467,16 @@ "task_id": 18, "intent_template_id": 73, "start_urls": ["__MAP__"], - "intent": "Tell me the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh", - "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": {"start": "AMC Waterfront", "end": "Univ of Pittsburgh"}, - "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "intent": "Get the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "AMC Waterfront", + "end": "Univ of Pittsburgh", + "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -542,12 +484,11 @@ "properties": { "mode" : { "type": "string" }, "duration": { "type": "string", "format": "duration" } - }, - "required": ["duration", "mode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "mode": "driving", "duration": "2min" }, @@ -563,14 +504,16 @@ "task_id": 19, "intent_template_id": 73, "start_urls": ["__MAP__"], - "intent": "Tell me the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University", - "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": {"start": "Carnegie Science Center", "end": "Carnegie Mellon University"}, - "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "intent": "Get the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Carnegie Science Center", + "end": "Carnegie Mellon University", + "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -578,12 +521,11 @@ "properties": { "mode" : { "type": "string" }, "duration": { "type": "string", "format": "duration" } - }, - "required": ["duration", "mode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "mode": "driving", "duration": "12min" }, @@ -599,14 +541,16 @@ "task_id": 20, "intent_template_id": 73, "start_urls": ["__MAP__"], - "intent": "Tell me the time for walking and driving route from Randyland to Carnegie Mellon University", - "intent_template": "Tell me the time for walking and driving route from {{start}} to {{end}}", - "instantiation_dict": {"start": "Randyland", "end": "Carnegie Mellon University"}, - "format_specification": "Use \"mode\" for the travel mode and \"duration\" for the duration.", + "intent": "Get the time for walking and driving route from Randyland to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Randyland", + "end": "Carnegie Mellon University", + "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -614,12 +558,11 @@ "properties": { "mode" : { "type": "string" }, "duration": { "type": "string", "format": "duration" } - }, - "required": ["duration", "mode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "mode": "driving", "duration": "13min" }, @@ -640,14 +583,12 @@ "intent": "Get name(s) of reviewer(s) who mention ear cups being small for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "ear cups being small"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Catso", @@ -670,16 +611,11 @@ "intent": "Get name(s) of reviewer(s) who mention under water photo for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "under water photo"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], 
"revision": 2 @@ -694,14 +630,12 @@ "intent": "Get name(s) of reviewer(s) who mention good fingerprint resistant for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "good fingerprint resistant"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Rachel", "T. Gannon"] } @@ -719,16 +653,11 @@ "intent": "Get name(s) of reviewer(s) who mention price being unfair for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "price being unfair"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -743,13 +672,12 @@ "intent": "Get name(s) of reviewer(s) who mention print quality explicitly with a rating of 3 or less stars for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "print quality explicitly with a rating of 3 or less stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Roxanne Brandon Coffey", "Nelson"] } @@ -767,14 
+695,12 @@ "intent": "Get name(s) of reviewer(s) who mention complain of the customer service for the product on the current page", "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", "instantiation_dict": {"description": "complain of the customer service"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["RemyRRemyR", "Bob in Vegas"] } @@ -787,14 +713,15 @@ "task_id": 27, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "In the Showerthoughts forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "instantiation_dict": {"forum": "personal finances"}, - "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", + "intent": "In the personal finances forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "forum": "personal finances", + "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -803,12 +730,11 @@ "username" : { "type": "string" }, "post_title": { "type": "string" }, "count" : { "type": "number" } - }, - "required": ["username", "post_title", "count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -827,14 +753,15 @@ "task_id": 28, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "In the Worcester forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "instantiation_dict": {"forum": "Worcester"}, - "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", + "intent": "In the Worcester forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "forum": "Worcester", + "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -843,12 +770,11 @@ "username" : { "type": "string" }, "post_title": { "type": "string" }, "count" : { "type": "number" } - }, - "required": ["username", "post_title", "count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -867,14 +793,15 @@ "task_id": 29, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "In the DIY forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "instantiation_dict": {"forum": "DIY"}, - "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", + "intent": "In the DIY forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "forum": "DIY", + "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -883,12 +810,11 @@ "username" : { "type": "string" }, "post_title": { "type": "string" }, "count" : { "type": "number" } - }, - "required": ["username", "post_title", "count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -907,14 +833,15 @@ "task_id": 30, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "In the space forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "instantiation_dict": {"forum": "space"}, - "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", + "intent": "In the space forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "forum": "space", + "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -923,12 +850,11 @@ "username" : { "type": "string" }, "post_title": { "type": "string" }, "count" : { "type": "number" } - }, - "required": ["username", "post_title", "count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -947,14 +873,15 @@ "task_id": 31, "intent_template_id": 33, "start_urls": ["__REDDIT__"], - "intent": "In the photoshopbattles forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes.", - "instantiation_dict": {"forum": "photoshopbattles"}, - "format_specification": "Use \"username\" for the username, \"post_title\" for the post title, and \"count\" for the number of comments.", + "intent": "In the photoshopbattles forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", + "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "forum": "photoshopbattles", + "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -963,12 +890,11 @@ "username" : { "type": "string" }, "post_title": { "type": "string" }, "count" : { "type": "number" } - }, - "required": ["username", "post_title", "count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -987,32 +913,30 @@ "task_id": 32, "intent_template_id": 78, "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.", - "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent": "I will arrive at Pittsburgh Airport soon. Find a Hilton hotel in the vicinity, if available, and get me its name and the walking distance to the nearest supermarket own by a local company from the hotel. Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "place": "Pittsburgh Airport", - "information": "the walking distance", + "information": "walking distance", "target1": "Hilton hotel", - "target2": "the nearest supermarket own by a local company" + "target2": "the nearest supermarket own by a local company", + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "hotel" : { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["hotel", "distance"] + "hotel" : { "type": "string", "format": "location-name" }, + "distance": { "type": "string", "format": "distance" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "2km"} @@ -1027,32 +951,30 @@ "task_id": 33, "intent_template_id": 78, "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.", - "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent": "I will arrive at Pittsburgh Airport soon. Find a Hilton hotel in the vicinity, if available, and get me its name and the shortest walking distance to a supermarket from the hotel. 
Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hilton hotel", - "information": "the shortest walking distance", - "target2": "a supermarket" + "information": "shortest walking distance", + "target2": "a supermarket", + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": "Use \"hotel\" for the hotel name and \"distance\" for the distance.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "hotel" : { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["hotel", "distance"] + "hotel" : { "type": "string", "format": "location-name" }, + "distance": { "type": "string", "format": "distance" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "1.4km"} @@ -1067,37 +989,35 @@ "task_id": 34, "intent_template_id": 78, "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.", - "intent_template": "I will arrive at {{place}} soon. 
Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent": "I will arrive at Pittsburgh Airport soon. Find a Hyatt hotel in the vicinity, if available, and get me its name and the shortest walking time to a supermarket from the hotel. Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hyatt hotel", - "information": "the shortest walking time", - "target2": "a supermarket" + "information": "shortest walking time", + "target2": "a supermarket", + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details" }, - "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "hotel" : { "type": "string" }, - "information": { "type": "string", "format": "duration" } - }, - "required": ["hotel", "information"] + "hotel" : { "type": "string", "format": "location-name" }, + "information": { "type": "string", "format": "duration" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "hotel": "Hyatt Regency Pittsburgh International Airport", - "information": "3h 30min" + "hotel_name": "Hyatt Regency Pittsburgh International Airport", + 
"travel_time": "3h 30min" } ] } @@ -1110,37 +1030,35 @@ "task_id": 35, "intent_template_id": 78, "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.", - "intent_template": "I will arrive at {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.", + "intent": "I will arrive at Pittsburgh Airport soon. Find a Hyatt hotel in the vicinity, if available, and get me its name and the minimal driving time to a supermarket from the hotel. Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "place": "Pittsburgh Airport", "target1": "Hyatt hotel", - "information": "the minimal driving time", - "target2": "a supermarket" + "information": "minimal driving time", + "target2": "a supermarket", + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details" }, - "format_specification": "Use \"hotel\" for the hotel name and \"information\" for the distance or time information requested.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "hotel" : { "type": "string" }, - "information": { "type": "string", "format": "duration" } - }, - "required": ["hotel", "information"] + "hotel" : { "type": "string", "format": "location-name" }, + "information": { "type": "string", "format": "duration" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "hotel": "Hyatt Regency Pittsburgh International Airport", - "information": "15min" + "hotel_name": "Hyatt Regency Pittsburgh International Airport", + "travel_time": "15min" } ] } @@ -1153,23 +1071,18 @@ "task_id": 36, "intent_template_id": 77, "start_urls": ["__MAP__"], - "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University", - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", + "intent": "Determine whether the social security administration in Pittsburgh can be reached within one hour by car from Carnegie Mellon University. Return true if it can, otherwise false. 
(Use the OSRM direction service.)", + "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", "instantiation_dict": { "place": "social security administration", - "location": "Carnegie Mellon University" + "location": "Carnegie Mellon University", + "retrieved_data_format_spec": ". Return true if it can, otherwise false." }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -1179,20 +1092,18 @@ "task_id": 37, "intent_template_id": 77, "start_urls": ["__MAP__"], - "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU", - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": {"place": "police station", "location": "gates building at CMU"}, - "format_specification": null, + "intent": "Determine whether the police station in Pittsburgh can be reached within one hour by car from gates building at CMU. Return true if it can, otherwise false. (Use the OSRM direction service.)", + "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", + "instantiation_dict": { + "place": "police station", + "location": "gates building at CMU", + "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -1202,20 +1113,18 @@ "task_id": 38, "intent_template_id": 77, "start_urls": ["__MAP__"], - "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport", - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": {"place": "duquesne university", "location": "pittsburgh airport"}, - "format_specification": null, + "intent": "Determine whether the duquesne university in Pittsburgh can be reached within one hour by car from pittsburgh airport. Return true if it can, otherwise false. (Use the OSRM direction service.)", + "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", + "instantiation_dict": { + "place": "duquesne university", + "location": "pittsburgh airport", + "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -1225,20 +1134,18 @@ "task_id": 39, "intent_template_id": 77, "start_urls": ["__MAP__"], - "intent": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue", - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": {"place": "walmart", "location": "5600 fifth avenue"}, - "format_specification": null, + "intent": "Determine whether the walmart in Pittsburgh can be reached within one hour by car from 5600 fifth avenue. Return true if it can, otherwise false. (Use the OSRM direction service.)", + "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", + "instantiation_dict": { + "place": "walmart", + "location": "5600 fifth avenue", + "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -1248,20 +1155,18 @@ "task_id": 40, "intent_template_id": 77, "start_urls": ["__MAP__"], - "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street", - "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}", - "instantiation_dict": {"place": "amc theatre", "location": "hobart street"}, - "format_specification": null, + "intent": "Determine whether the amc theatre in Pittsburgh can be reached within one hour by car from hobart street. Return true if it can, otherwise false. (Use the OSRM direction service.)", + "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", + "instantiation_dict": { + "place": "amc theatre", + "location": "hobart street", + "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -1274,14 +1179,12 @@ "intent": "Get the top 1 search term(s) in my store", "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "1"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["hollister"] } @@ -1297,14 +1200,12 @@ "intent": "Get the top 2 search term(s) in my store", "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "2"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["hollister", "Joust Bag"] } @@ -1320,14 +1221,12 @@ "intent": "Get the top 3 search term(s) in my store", "intent_template": "Get the top {{n}} search term(s) in my store", "instantiation_dict": {"n": "3"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["hollister", "Joust Bag", "nike"] } @@ -1340,19 +1239,17 @@ "task_id": 44, "intent_template_id": 303, "start_urls": ["__GITLAB__"], - "intent": "Show me my todos", - "intent_template": "Show 
me my todos", + "intent": "Open my todos page", + "intent_template": "Open my todos page", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": ["__GITLAB__/dashboard/todos", "__GITLAB__/dashboard/todos?state=pending"] @@ -1366,15 +1263,14 @@ "task_id": 45, "intent_template_id": 300, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Show me the most recent open issues for the current project", - "intent_template": "Show me the most recent open issues for the current project", + "intent": "Open the issues page for the current project filtered to the most recent open issues", + "intent_template": "Open the issues page for the current project filtered to the most recent open issues", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -1388,26 +1284,22 @@ "task_id": 46, "intent_template_id": 300, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Show me the most recent open issues for the current project", - "intent_template": "Show me the most recent open issues for the current project", + "intent": "Open the issues page for the current project filtered to the most recent open issues", + "intent_template": "Open the issues page for the current project filtered to the most recent open issues", "instantiation_dict": {}, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": [ "__GITLAB__/primer/design/-/issues", "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" - ], - "response_status": 200 + ] } } ], @@ -1418,14 +1310,15 @@ "task_id": 47, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past months, and the total amount of money I spent (including shipping and handling fees).", - "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", - "instantiation_dict": {"period": "over the past months"}, - "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "intent": "Today is June 12, 2023. Get how many complete orders I have over the past months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", + "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "over the past months", + "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -1433,12 +1326,11 @@ "properties": { "order_count": { "type": "number" }, "amount" : { "type": "number", "format": "currency" } - }, - "required": ["amount", "order_count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"order_count": 0, "amount": 0} ] } @@ -1451,14 +1343,15 @@ "task_id": 48, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past three days, and the total amount of money I spent (including shipping and handling fees).", - "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", - "instantiation_dict": {"period": "over the past three days"}, - "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "intent": "Today is June 12, 2023. Get how many complete orders I have over the past three days, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", + "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "over the past three days", + "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -1466,12 +1359,11 @@ "properties": { "order_count": { "type": "number" }, "amount" : { "type": "number", "format": "currency" } - }, - "required": ["amount", "order_count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"order_count": 0, "amount": 0} ] } @@ -1484,14 +1376,15 @@ "task_id": 49, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past four months, and the total amount of money I spent (including shipping and handling fees).", - "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", - "instantiation_dict": {"period": "over the past four months"}, - "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "intent": "Today is June 12, 2023. Get how many complete orders I have over the past four months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", + "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "over the past four months", + "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -1499,12 +1392,11 @@ "properties": { "order_count": { "type": "number" }, "amount" : { "type": "number", "format": "currency" } - }, - "required": ["amount", "order_count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"order_count": 3, "amount": 845.49} ] } @@ -1517,14 +1409,15 @@ "task_id": 50, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past year, and the total amount of money I spent (including shipping and handling fees).", - "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", - "instantiation_dict": {"period": "over the past year"}, - "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "intent": "Today is June 12, 2023. Get how many complete orders I have over the past year, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", + "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "over the past year", + "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -1532,12 +1425,11 @@ "properties": { "order_count": { "type": "number" }, "amount" : { "type": "number", "format": "currency" } - }, - "required": ["amount", "order_count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"order_count": 21, "amount": 6560.69} ] } @@ -1550,14 +1442,15 @@ "task_id": 51, "intent_template_id": 197, "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Tell me how many complete orders I have over the past six months, and the total amount of money I spent (including shipping and handling fees).", - "intent_template": "Today is June 12, 2023. Tell me how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees).", - "instantiation_dict": {"period": "over the past six months"}, - "format_specification": "Use \"order_count\" for the number of orders and \"amount\" for the total amount spent.", + "intent": "Today is June 12, 2023. Get how many complete orders I have over the past six months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", + "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "over the past six months", + "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -1565,12 +1458,11 @@ "properties": { "order_count": { "type": "number" }, "amount" : { "type": "number", "format": "currency" } - }, - "required": ["amount", "order_count"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"order_count": 7, "amount": 1700.84} ] } @@ -1583,20 +1475,18 @@ "task_id": 52, "intent_template_id": 68, "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?", - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": {"start": "Carnegie Mellon University", "end": "starbucks on Craig Street"}, - "format_specification": null, + "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "starbucks on Craig Street", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["7min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["7min"] } } ], "revision": 2 @@ -1606,20 +1496,18 @@ "task_id": 53, "intent_template_id": 68, "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?", - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": {"start": "Univ of Pittsburgh", "end": "starbucks on Craig Street"}, - "format_specification": null, + "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Univ of Pittsburgh", + "end": "starbucks on Craig Street", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["18min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["18min"] } } ], "revision": 2 @@ -1629,20 +1517,18 @@ "task_id": 54, "intent_template_id": 68, "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?", - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": {"start": "Carnegie Mellon University", "end": "Univ of Pittsburgh"}, - "format_specification": null, + "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Carnegie Mellon University", + "end": "Univ of Pittsburgh", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["25min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["25min"] } } ], "revision": 2 @@ -1652,20 +1538,18 @@ "task_id": 55, "intent_template_id": 68, "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from the starbuck near CMU to Chatham university?", - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": {"start": "the starbuck near CMU", "end": "Chatham university"}, - "format_specification": null, + "intent": "How long does it take to walk from the Starbucks near CMU to Chatham university? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "the Starbucks near CMU", + "end": "Chatham university", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["30min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["30min"] } } ], "revision": 2 @@ -1675,20 +1559,18 @@ "task_id": 56, "intent_template_id": 68, "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?", - "intent_template": "How long does it take to walk from {{start}} to {{end}}?", - "instantiation_dict": {"start": "Carnegie Museum of Art", "end": "a library at CMU"}, - "format_specification": null, + "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "start": "Carnegie Museum of Art", + "end": "a library at CMU", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["11min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["11min"] } } ], "revision": 2 @@ -1698,20 +1580,18 @@ "task_id": 57, "intent_template_id": 69, "start_urls": ["__MAP__"], - "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University", - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "intent": "Get the closest restaurant(s) to university center at Carnegie Mellon University", + "intent_template": "Get the closest {{place1}}(s) to {{place2}}", "instantiation_dict": { "place1": "restaurant", "place2": "university center at Carnegie Mellon University" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "El Gallo de Oro", "Back Bar Grill", "Grano", "Beefsteak", "Nourish", @@ -1727,17 +1607,15 @@ "task_id": 58, "intent_template_id": 69, "start_urls": ["__MAP__"], - "intent": "Tell me the closest cafe(s) to CMU Hunt library", - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "intent": "Get the closest cafe(s) to CMU Hunt library", + "intent_template": "Get the closest {{place1}}(s) to {{place2}}", 
"instantiation_dict": {"place1": "cafe", "place2": "CMU Hunt library"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["De Fer Coffee & Tea"] } @@ -1750,17 +1628,15 @@ "task_id": 59, "intent_template_id": 69, "start_urls": ["__MAP__"], - "intent": "Tell me the closest restaurant(s) to CMU Hunt library", - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "intent": "Get the closest restaurant(s) to CMU Hunt library", + "intent_template": "Get the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Hunt library"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["The exchange"] } @@ -1773,17 +1649,15 @@ "task_id": 60, "intent_template_id": 69, "start_urls": ["__MAP__"], - "intent": "Tell me the closest restaurant(s) to CMU Posner Hall", - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "intent": "Get the closest restaurant(s) to CMU Posner Hall", + "intent_template": "Get the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Posner Hall"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": 
{"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["The exchange"] } @@ -1796,17 +1670,15 @@ "task_id": 61, "intent_template_id": 69, "start_urls": ["__MAP__"], - "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library", - "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}", + "intent": "Get the closest restaurant(s) to CMU Sorrells Library", + "intent_template": "Get the closest {{place1}}(s) to {{place2}}", "instantiation_dict": {"place1": "restaurant", "place2": "CMU Sorrells Library"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["La Prima Espresso"] } @@ -1819,19 +1691,17 @@ "task_id": 62, "intent_template_id": 276, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer name(s) who completed the most number of orders in the entire history", - "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", - "instantiation_dict": {"quantifier": "most"}, - "format_specification": null, + "intent": "Get customer email(s) who completed the most number of orders in the entire history", + "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", + "instantiation_dict": {"order_criteria": "completed the most number of orders"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Jane Smith"] + 
"retrieved_data": ["janesmith456@yahoo.com"] } } ], @@ -1842,19 +1712,17 @@ "task_id": 63, "intent_template_id": 276, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer name(s) who completed the second most number of orders in the entire history", - "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", - "instantiation_dict": {"quantifier": "second most"}, - "format_specification": null, + "intent": "Get customer email(s) who completed the second most number of orders in the entire history", + "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", + "instantiation_dict": {"order_criteria": "completed the second most number of orders"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Michael Nguyen", "Sarah Miller"] + "retrieved_data": ["helloworld@yahoo.com", "michael.nguyen@yahoo.com"] } } ], @@ -1865,19 +1733,17 @@ "task_id": 64, "intent_template_id": 276, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer name(s) who have 2 orders in any state in the entire history", - "intent_template": "Get customer name(s) who have {{number}} orders in any state in the entire history", - "instantiation_dict": {"number": "2"}, - "format_specification": null, + "intent": "Get customer email(s) who have 2 orders in any state in the entire history", + "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", + "instantiation_dict": {"order_criteria": "have 2 orders in any state"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - 
"retrieved_data": ["Emma Davis", "Veronica Costello"] + "retrieved_data": ["musiclover99@hotmail.com", "roni_cost@example.com"] } } ], @@ -1888,19 +1754,17 @@ "task_id": 65, "intent_template_id": 276, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer name(s) who completed the fifth most number of orders in the entire history", - "intent_template": "Get customer name(s) who completed the {{quantifier}} number of orders in the entire history", - "instantiation_dict": {"quantifier": "fifth most"}, - "format_specification": null, + "intent": "Get customer email(s) who completed the fifth most number of orders in the entire history", + "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", + "instantiation_dict": {"order_criteria": "completed the fifth most number of orders"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Adam Garcia", "Jason Miller", "Jennifer White"] + "retrieved_data": ["adam.garcia@gmail.com", "jason.miller@yahoo.com", "jennifer.white@yahoo.com"] } } ], @@ -1911,17 +1775,21 @@ "task_id": 66, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Get the post titles of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", - "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", - "instantiation_dict": {"number": 10, "description": "the post titles", "forum": "books"}, - "format_specification": null, + "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the post titles from those that recommend exactly one book.", + "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that 
{{filter_criterion}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "number": 10, + "description": "the post titles", + "forum": "Books", + "filter_criterion": "recommend exactly one book", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "I just finished reading The Hobbit to my 6 year old daughter, and she loved it!", @@ -1937,21 +1805,21 @@ "task_id": 67, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Get the book titles, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", - "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", + "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the book titles, exactly as they appear in the post description, from those that recommend exactly one book.", + "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", "instantiation_dict": { "number": 10, "description": "the book titles, exactly as they appear in the post description,", - "forum": "Books" + "forum": "Books", + "filter_criterion": "recommend exactly one book", + "retrieved_data_format_spec": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["The Hobbit", "A Christmas Carol"] } @@ -1964,28 +1832,27 @@ "task_id": 68, "intent_template_id": 17, 
"start_urls": ["__REDDIT__"], - "intent": "Get the author names and book titles, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that recommend exactly one book.", - "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that recommend exactly one book.", + "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the author names and book titles, exactly as they appear in the post description, from those that recommend exactly one book. Return a list of objects with keys \"book\" and \"author\".", + "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", "instantiation_dict": { "number": 10, "description": "the author names and book titles, exactly as they appear in the post description,", - "forum": "Books" + "forum": "Books", + "filter_criterion": "recommend exactly one book", + "retrieved_data_format_spec": " Return a list of objects with keys \"book\" and \"author\"." }, - "format_specification": "For each pair, return a dictionary with the key 'book' for the book title and 'author' for the author name. 
Use null for any missing value.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "book": {"type": "string"}, "author": {"type": "string"} }, - "required": ["author", "book"] + "properties": { "book": {"type": "string"}, "author": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "book": "The Hobbit" , "author": "Tolkien" }, @@ -2001,21 +1868,21 @@ "task_id": 69, "intent_template_id": 17, "start_urls": ["__REDDIT__"], - "intent": "Get the URLs of any organizations involved, exactly as they appear in the post description, of the top 10 hottest posts in the \"Books\" forum that talk about supporting local book stores.", - "intent_template": "Get {{description}} of the top {{number}} hottest posts in the \"{{forum}}\" forum that talk about supporting local book stores.", + "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the URLs of any organizations involved, exactly as they appear in the post description, from those that talk about supporting local book stores.", + "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", "instantiation_dict": { "number": 10, "description": "the URLs of any organizations involved, exactly as they appear in the post description,", - "forum": "Books" + "forum": "Books", + "filter_criterion": "talk about supporting local book stores", + "retrieved_data_format_spec": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ ["bookshop.org", "https://bookshop.org"] ] 
} @@ -2031,17 +1898,11 @@ "intent": "What is the zip code of Carnegie Mellon University?", "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Carnegie Mellon University"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["15213"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["15213"] } } ], "revision": 2 @@ -2054,17 +1915,11 @@ "intent": "What is the zip code of Chatham University?", "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Chatham University"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["15232"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["15232"] } } ], "revision": 2 @@ -2077,17 +1932,11 @@ "intent": "What is the zip code of Yale University?", "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Yale University"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["06516"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["06516"] } } ], "revision": 2 @@ -2100,17 +1949,11 @@ "intent": "What is the zip code of Columbia University?", "intent_template": "What is the zip code of {{place}}?", "instantiation_dict": {"place": "Columbia University"}, - "format_specification": null, 
"eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["10027"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["10027"] } } ], "revision": 2 @@ -2120,22 +1963,21 @@ "task_id": 74, "intent_template_id": 65, "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Carnegie Mellon University\", \"apple store shadyside\", \"starbucks on craig street\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent": "Given the following locations,\"Carnegie Mellon University\", \"apple store shadyside\", \"starbucks on craig street\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. 
(Use the OSRM direction service.)", "instantiation_dict": { "place_list": [ "Carnegie Mellon University", "apple store shadyside", "starbucks on craig street" ] }, - "format_specification": "Return the list of place in order using their names.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Carnegie Mellon University", "starbucks on craig street", @@ -2151,22 +1993,21 @@ "task_id": 75, "intent_template_id": 65, "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Massachusetts Institute of Technology\", \"Harvard University\", \"Boston Logan International Airport\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent": "Given the following locations,\"Massachusetts Institute of Technology\", \"Harvard University\", \"Boston Logan International Airport\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed. (Use the OSRM direction service.)", "instantiation_dict": { "place_list": [ "Massachusetts Institute of Technology", "Harvard University", "Boston Logan International Airport" ] }, - "format_specification": "Return the list of place in order using their names.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Massachusetts Institute of Technology", "Harvard University", @@ -2182,19 +2023,18 @@ "task_id": 76, "intent_template_id": 65, "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Princeton University\", \"Yale University\", \"Harvard University\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.", + "intent": "Given the following locations,\"Princeton University\", \"Yale University\", \"Harvard University\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", + "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed. (Use the OSRM direction service.)", "instantiation_dict": { "place_list": ["Princeton University", "Yale University", "Harvard University"] }, - "format_specification": "Return the list of place in order using their names.", "eval": [ { "evaluator": "AgentResponseEvaluator", "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Princeton University", "Yale University", "Harvard University"] } @@ -2207,20 +2047,14 @@ "task_id": 77, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Pending reviews amongst all the reviews?", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Pending reviews amongst all the reviews", + "intent_template": "Get the total number of {{status}} reviews amongst all the reviews", "instantiation_dict": {"status": "Pending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [5] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } } ], "revision": 2 @@ -2230,20 +2064,14 @@ "task_id": 78, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Approved reviews amongst all the reviews?", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Approved reviews amongst all the reviews", + "intent_template": "Get the total number of {{status}} reviews 
amongst all the reviews", "instantiation_dict": {"status": "Approved"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [346] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [346] } } ], "revision": 2 @@ -2253,20 +2081,14 @@ "task_id": 79, "intent_template_id": 277, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Not Approved reviews amongst all the reviews?", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews?", + "intent": "Get the total number of Not Approved reviews amongst all the reviews", + "intent_template": "Get the total number of {{status}} reviews amongst all the reviews", "instantiation_dict": {"status": "Not Approved"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -2276,24 +2098,19 @@ "task_id": 80, "intent_template_id": 72, "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. 
(Use the OSRM direction service.)", + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", "instantiation_dict": { "place_A": "Carnegie Mellon University", "place_B": "Starbucks on Craig Street", - "place_C": "Pittsburgh International Airport" + "place_C": "Pittsburgh International Airport", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["38min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["38min"] } } ], "revision": 2 @@ -2303,24 +2120,19 @@ "task_id": 81, "intent_template_id": 72, "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "place_A": "Univ of Pittsburgh", "place_B": "starbucks on Craig Street", - "place_C": "Pittsburgh International Airport" + "place_C": "Pittsburgh International Airport", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["49min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["49min"] } } ], "revision": 2 @@ -2330,24 +2142,19 @@ "task_id": 82, "intent_template_id": 72, "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "place_A": "Massachusetts Institute of Technology", "place_B": "Harvard University", - "place_C": "Boston Logan International Airport" + "place_C": "Boston Logan International Airport", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["63min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["63min"] } } ], "revision": 2 @@ -2357,24 +2164,19 @@ "task_id": 83, "intent_template_id": 72, "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?", + "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "place_A": "Carnegie Mellon University", "place_B": "apple store shadyside", - "place_C": "starbucks on craig street" + "place_C": "starbucks on craig street", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["22min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["22min"] } } ], "revision": 2 @@ -2384,20 +2186,18 @@ "task_id": 84, "intent_template_id": 64, "start_urls": ["__MAP__"], - "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": {"hotel": "DoubleTree by Hilton New York Downtown", "place": "Keens Steakhouse"}, - "format_specification": null, + "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "hotel": "DoubleTree by Hilton New York Downtown", + "place": "Keens Steakhouse", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["14min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["14min"] } } ], "revision": 2 @@ -2407,23 +2207,18 @@ "task_id": 85, "intent_template_id": 64, "start_urls": ["__MAP__"], - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "hotel": "La Quinta Inn near the airport", - "place": "Carnegie Mellon University" + "place": "Carnegie Mellon University", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["30min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["30min"] } } ], "revision": 2 @@ -2433,20 +2228,18 @@ "task_id": 86, "intent_template_id": 64, "start_urls": ["__MAP__"], - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": {"hotel": "La Quinta Inn near the airport", "place": "Upitt"}, - "format_specification": null, + "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "hotel": "La Quinta Inn near the airport", + "place": "Upitt", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["29min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["29min"] } } ], "revision": 2 @@ -2456,20 +2249,18 @@ "task_id": 87, "intent_template_id": 64, "start_urls": ["__MAP__"], - "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": {"hotel": "red roof inn", "place": "Pittsburgh science museum"}, - "format_specification": null, + "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "hotel": "red roof inn", + "place": "Pittsburgh science museum", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["20min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["20min"] } } ], "revision": 2 @@ -2479,20 +2270,18 @@ "task_id": 88, "intent_template_id": 64, "start_urls": ["__MAP__"], - "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?", - "instantiation_dict": {"hotel": "Homewood Suites Southpointe", "place": "PPG Paints Arena"}, - "format_specification": null, + "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "hotel": "Homewood Suites Southpointe", + "place": "PPG Paints Arena", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["34min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["34min"] } } ], "revision": 2 @@ -2502,135 +2291,137 @@ "task_id": 89, "intent_template_id": 67, "start_urls": ["__MAP__"], - "intent": "Which US states border Connecticut?", - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": {"state": "Connecticut"}, - "format_specification": null, + "intent": "Get the relation IDs of each US state that borders Connecticut. Return a list of integers only, without any additional details.", + "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "state": "Connecticut", + "retrieved_data_format_spec": "Return a list of integers only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Rhode Island", "Massachusetts", "New York"] + "retrieved_data": [392915, 61315, 175905] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["map"], "task_id": 90, "intent_template_id": 67, "start_urls": ["__MAP__"], - "intent": "Which US states border Pennsylvania?", - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": {"state": "Pennsylvania"}, - "format_specification": null, + "intent": "Get the relation IDs of each US state that borders Pennsylvania. Return a list of integers only, without any additional details.", + "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "state": "Pennsylvania", + "retrieved_data_format_spec": "Return a list of integers only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Ohio", "Maryland", "New York", "New Jersey", "Delaware", "West Virginia"] + "retrieved_data": [162061, 162112, 175905, 224951, 162110, 162068] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["map"], "task_id": 91, "intent_template_id": 67, "start_urls": ["__MAP__"], - "intent": "Which US states border Massachusetts?", - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": {"state": "Massachusetts"}, - "format_specification": null, + "intent": "Get the relation IDs of each US state that borders Massachusetts. Return a list of integers only, without any additional details.", + "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "state": "Massachusetts", + "retrieved_data_format_spec": "Return a list of integers only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Rhode Island", "Connecticut", "New York", "New Hampshire", "Vermont"] + "retrieved_data": [392915, 165794, 175905, 67213, 60759] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["map"], "task_id": 92, "intent_template_id": 67, "start_urls": ["__MAP__"], - "intent": "Which US states border Vermont?", - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": {"state": "Vermont"}, - "format_specification": null, + "intent": "Get the relation IDs of each US state that borders Vermont. Return a list of integers only, without any additional details.", + "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "state": "Vermont", + "retrieved_data_format_spec": "Return a list of integers only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["New York", "New Hampshire", "Massachusetts"] + "retrieved_data": [175905, 67213, 61315] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["map"], "task_id": 93, "intent_template_id": 67, "start_urls": ["__MAP__"], - "intent": "Which US states border New Hampshire?", - "intent_template": "Which US states border {{state}}?", - "instantiation_dict": {"state": "New Hampshire"}, - "format_specification": null, + "intent": "Get the relation IDs of each US state that borders New Hampshire. Return a list of integers only, without any additional details.", + "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "state": "New Hampshire", + "retrieved_data_format_spec": "Return a list of integers only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Massachusetts", "Vermont", "Maine"] + "retrieved_data": [61315, 60759, 63512] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["shopping_admin"], "task_id": 94, "intent_template_id": 274, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the grand total of invoice 000000001.", - "intent_template": "Tell me the grand total of invoice {{id}}.", - "instantiation_dict": {"id": "000000001"}, - "format_specification": null, - "eval": [ + "intent": "Get the grand total of invoice 000000001. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "Get the grand total of invoice {{id}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "id": "000000001", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, + "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [36.39] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [36.39] } } ], "revision": 2 @@ -2640,20 +2431,17 @@ "task_id": 95, "intent_template_id": 274, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the grand total of invoice 000000002.", - "intent_template": "Tell me the grand total of invoice {{id}}.", - "instantiation_dict": {"id": "000000002"}, - "format_specification": null, + "intent": "Get the grand total of invoice 000000002. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "Get the grand total of invoice {{id}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "id": "000000002", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [39.64] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [39.64] } } ], "revision": 2 @@ -2663,14 +2451,14 @@ "task_id": 96, "intent_template_id": 193, "start_urls": ["__SHOPPING__"], - "intent": "Tell me the status of my latest order and when will it arrive", - "intent_template": "Tell me the status of my latest order and when will it arrive", - "instantiation_dict": {}, - "format_specification": "Use \"status\" for status and \"arrival_date\" for arrival date (null if not applicable).", + "intent": "Get the status of my latest order and when will it arrive. Return a list of objects with keys \"status\" and \"arrival_date\" (YYYY-MM-DD format or null if not available), without any additional details.", + "intent_template": "Get the status of my latest order and when will it arrive. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "retrieved_data_format_spec": "Return a list of objects with keys \"status\" and \"arrival_date\" (YYYY-MM-DD format or null if not available), without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -2678,12 +2466,11 @@ "properties": { "status" : { "type": "string" }, "arrival_date": { "type": "string", "format": "date" } - }, - "required": ["arrival_date", "status"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"status": "canceled", "arrival_date": null} ] } @@ -2696,20 +2483,21 @@ "task_id": 97, "intent_template_id": 120, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts (use the provided wiki site to look up any needed information)", - "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts (use the provided wiki site to look up any needed information)", - "instantiation_dict": {}, - "format_specification": null, + "intent": "Get the driving distance from Carnegie Mellon University to the 3rd ranked university worldwide by SCImago Institutions Rankings in 2019. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki site to look up any needed information.)", + "intent_template": "Get the driving distance from Carnegie Mellon University to the 3rd ranked university worldwide by SCImago Institutions Rankings in 2019. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service and the provided wiki site to look up any needed information.)", + "instantiation_dict": { + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["914km"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["914km"] } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/routed-car/route/v1/.*/.*$"} } ], "revision": 2 @@ -2719,31 +2507,50 @@ "task_id": 98, "intent_template_id": 66, "start_urls": ["__MAP__"], - "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": {"places": "tea cafe", "start": "University of Pittsburgh"}, - "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "places": "tea cafe", + "start": "University of Pittsburgh", + "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "location": { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["distance", "location"] + "location": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" } + } + }, + "distance": {"type": "string", "format": "distance"} + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "location": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "location": { + "name": "Fuku Tea", + "house_number": "3716", + "street": "Forbes Avenue", + "city": "Pittsburgh", + "state": "Pennsylvania", + "postcode": "15213" + }, "distance": "653m" } ] @@ -2757,31 +2564,50 @@ "task_id": 99, "intent_template_id": 66, "start_urls": ["__MAP__"], - "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": {"places": "Five Guys", "start": "5700 Penn Ave"}, - "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "intent": "Where is the nearest Five Guys to 5700 Penn Ave, 
and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", + "instantiation_dict": { + "places": "Five Guys", + "start": "5700 Penn Ave", + "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "location": { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["distance", "location"] + "location": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" } + } + }, + "distance": {"type": "string", "format": "distance"} + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "location": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "location": { + "name": "Five Guys", + "house_number": "117", + "street": "South Bouquet Street", + "city": "Pittsburgh", + "state": "Pennsylvania", + "postcode": "15213" + }, "distance": "4km" } ] @@ -2795,31 +2621,50 @@ "task_id": 100, "intent_template_id": 66, "start_urls": ["__MAP__"], - "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?", 
- "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": {"places": "Starbucks", "start": "Carnegie Mellon"}, - "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", + "instantiation_dict": { + "places": "Starbucks", + "start": "Carnegie Mellon", + "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "location": { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["distance", "location"] + "location": { + "type": "object", + "title": "full_address", + "properties": { + "name" : { "type": "string" }, + "house_number": { "type": "string" }, + "street" : { "type": "string" }, + "city" : { "type": "string" }, + "state" : { "type": "string" }, + "postcode" : { "type": "string" } + } + }, + "distance": {"type": "string", "format": "distance"} + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "location": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States", + "location": { + "name": 
"Starbucks", + "house_number": "417", + "street": "South Craig Street", + "city": "Pittsburgh", + "state": "Pennsylvania", + "postcode": "15213" + }, "distance": "557m" } ] @@ -2833,19 +2678,18 @@ "task_id": 101, "intent_template_id": 66, "start_urls": ["__MAP__"], - "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?", - "instantiation_dict": {"places": "In-N-Out", "start": "Upitts"}, - "format_specification": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance.", + "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it? Use \"location\" for the name and location of the place and \"distance\" for the walking distance. (Use the OSRM direction service.)", + "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "places": "In-N-Out", + "start": "Upitts", + "retrieved_data_format_spec": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -2855,20 +2699,22 @@ "task_id": 102, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of open issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", - "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "intent": "Navigate to the page showing the list of open issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", + "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", "instantiation_dict": {"label": "help wanted", "repo": "a11yproject/a11yproject.com", "state": "open"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -2886,20 
+2732,22 @@ "task_id": 103, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of closed issues in the kkroening/ffmpeg-python repository that have labels related to questions", - "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "intent": "Navigate to the page showing the list of closed issues in the kkroening/ffmpeg-python repository that have labels related to question", + "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", "instantiation_dict": {"label": "question", "repo": "kkroening/ffmpeg-python", "state": "closed"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/kkroening/ffmpeg-python/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -2917,20 +2765,22 @@ "task_id": 104, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of all issues in the keycloak/keycloak repository that have labels related to flaky-test", - "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "intent": "Navigate to the page showing the list of all issues in the keycloak/keycloak repository that have labels related to flaky-test", + "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels 
related to {{label}}", "instantiation_dict": {"label": "flaky-test", "repo": "keycloak/keycloak", "state": "all"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/keycloak/keycloak/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -2948,24 +2798,26 @@ "task_id": 105, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of not yet closed issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", - "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "intent": "Navigate to the page showing the list of not yet closed issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", + "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", "instantiation_dict": { "label": "OpenAPI Generator CLI", "repo": "OpenAPITools/openapi-generator", "state": "not yet closed" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": 
{"url": "^__GITLAB__/OpenAPITools/openapi-generator/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -2986,24 +2838,26 @@ "task_id": 106, "intent_template_id": 349, "start_urls": ["__GITLAB__"], - "intent": "Display the list of open issues in the umano/AndroidSlidingUpPanel repository that have labels related to all except BUG", - "intent_template": "Display the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", + "intent": "Navigate to the page showing the list of open issues in the umano/AndroidSlidingUpPanel repository that have labels related to all except BUG", + "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", "instantiation_dict": { "label": "all except BUG", "repo": "umano/AndroidSlidingUpPanel", "state": "open" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/umano/AndroidSlidingUpPanel/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -3021,24 +2875,24 @@ "task_id": 107, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from May 2022 through December 2022, inclusive", - "intent_template": "Get the monthly count of completed orders {{period}}", - "instantiation_dict": 
{"period": "from May 2022 through December 2022, inclusive"}, - "format_specification": "Return a list of objects, where each object includes a \"month\" field for the month and a \"count\" field for the count.", + "intent": "Get the monthly count of completed orders from May 2022 through December 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", + "intent_template": "Get the monthly count of completed orders {{period}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "from May 2022 through December 2022, inclusive", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, - "required": ["count", "month"] + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "month": "May" , "count": 8 }, @@ -3060,27 +2914,27 @@ "task_id": 108, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders January 2023 through May 2023", - "intent_template": "Get the monthly count of completed orders {{period}}", - "instantiation_dict": {"period": "January 2023 through May 2023"}, - "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "intent": "Get the monthly count of completed orders from January 2023 through May 2023, inclusive. 
Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", + "intent_template": "Get the monthly count of completed orders {{period}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "from January 2023 through May 2023, inclusive", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, - "required": ["count", "month"] + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "month": "January" , "count": 10 }, + { "month": "January" , "count": 12 }, { "month": "February", "count": 7 }, { "month": "March" , "count": 5 }, { "month": "April" , "count": 9 }, @@ -3096,24 +2950,24 @@ "task_id": 109, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Jan to December 2022", - "intent_template": "Get the monthly count of completed orders {{period}}", - "instantiation_dict": {"period": "from Jan to December 2022"}, - "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "intent": "Get the monthly count of completed orders from Jan to December 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", + "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "from Jan to December 2022, inclusive", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, - "required": ["count", "month"] + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "month": "January" , "count": 11 }, @@ -3139,24 +2993,24 @@ "task_id": 110, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Jan 2022 through Nov 2022,", - "intent_template": "Get the monthly count of completed orders {{period}}", - "instantiation_dict": {"period": "from Jan 2022 through Nov 2022"}, - "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "intent": "Get the monthly count of completed orders from Jan 2022 through Nov 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", + "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "from Jan 2022 through Nov 2022, inclusive", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, - "required": ["count", "month"] + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "month": "January" , "count": 11 }, @@ -3181,24 +3035,24 @@ "task_id": 111, "intent_template_id": 270, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Feb 2022 through Nov 2022", - "intent_template": "Get the monthly count of completed orders {{period}}", - "instantiation_dict": {"period": "from Feb 2022 through Nov 2022"}, - "format_specification": "Use \"month\" for the month and \"count\" for the count.", + "intent": "Get the monthly count of completed orders from Feb 2022 through Nov 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", + "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "period": "from Feb 2022 through Nov 2022, inclusive", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} }, - "required": ["count", "month"] + "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "month": "February" , "count": 16 }, @@ -3225,14 +3079,12 @@ "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Circe fleece", "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Circe fleece"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Hannah Lim"] } @@ -3248,14 +3100,12 @@ "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Olivia zip jacket", "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Olivia zip jacket"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Emma", "Seam Miller"] } 
@@ -3271,14 +3121,12 @@ "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Antonia racer tank", "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Antonia racer tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Shaunte", "Merrie"] } @@ -3294,16 +3142,11 @@ "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Chloe tank", "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "Chloe tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -3314,16 +3157,14 @@ "intent_template_id": 245, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for tanks products", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}} 3 stars or below for {{product}}", + "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", "instantiation_dict": {"product": "tanks products"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", 
"retrieved_data": [ "Dominic", "Trey", "Edmund", "Merrie", "Shaunte", "Teofila", "Carma", "Yan", @@ -3340,17 +3181,17 @@ "task_id": 117, "intent_template_id": 161, "start_urls": ["__SHOPPING__"], - "intent": "Get the date when I made my first purchase on this site", - "intent_template": "Get the date when I made my first purchase on this site", - "instantiation_dict": {}, - "format_specification": null, + "intent": "Get the date when I made my first purchase on this site. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Get the date when I made my first purchase on this site. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["March 2, 2022"] } @@ -3363,15 +3204,22 @@ "task_id": 118, "intent_template_id": 151, "start_urls": ["__SHOPPING__"], - "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.", - "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.", + "intent": "I have a jaw bruxism problem, go to the product page for something that could alleviate the problem.", + "intent_template": "I have a jaw bruxism problem, go to the product page for something that could alleviate the problem.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": 
null} + }, + { + "evaluator": "NetworkEventEvaluator", + "last_event_only": true, + "expected": { + "url": "^__SHOPPING__/[a-z0-9-]*(?:guard|mouth|teeth|night|dental|bruxism)[a-z0-9-]*\\.html$", + "response_status": 200 + } } ], "revision": 2 @@ -3381,24 +3229,24 @@ "task_id": 119, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Antonia Racer Tank.", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", - "instantiation_dict": {"product": "Antonia Racer Tank"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get title and rating for all reviews with 4 stars or above for Antonia Racer Tank. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Antonia Racer Tank", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "A regular or me", "rating": 4} ] } @@ -3411,24 +3259,24 @@ "task_id": 120, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Ana Running Short.", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", - "instantiation_dict": 
{"product": "Ana Running Short"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get title and rating for all reviews with 4 stars or above for Ana Running Short. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Ana Running Short", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "title": "It was really hard to find the right siz", "rating": 4 }, @@ -3444,24 +3292,24 @@ "task_id": 121, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Circe hooded fleece.", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", - "instantiation_dict": {"product": "Circe hooded fleece"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get title and rating for all reviews with 4 stars or above for Circe hooded fleece. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Circe hooded fleece", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] } @@ -3474,24 +3322,24 @@ "task_id": 122, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Olivia zip jacket.", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", - "instantiation_dict": {"product": "Olivia zip jacket"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get title and rating for all reviews with 4 stars or above for Olivia zip jacket. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Olivia zip jacket", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "Quite good", "rating": 5} ] } @@ -3504,24 +3352,24 @@ "task_id": 123, "intent_template_id": 250, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Circe's products.", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}.", - "instantiation_dict": {"product": "Circe's products"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get title and rating for all reviews with 4 stars or above for Circe's products. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Circe's products", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] } @@ -3534,14 +3382,15 @@ "task_id": 124, "intent_template_id": 159, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of wireless earphone in the One Stop Market?", - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": {"product": "wireless earphone"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range of wireless earphone in the One Stop Market?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range of {{product}} in the One Stop Market?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "wireless earphone", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -3549,12 +3398,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 0.01, "max": 298.0} ] } @@ -3567,14 +3415,15 @@ "task_id": 125, "intent_template_id": 159, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?", - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": {"product": "teeth grinding mouth guard"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range of {{product}} in the One Stop Market?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "teeth grinding mouth guard", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -3582,12 +3431,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 1.46, "max": 179.99} ] } @@ -3600,14 +3448,15 @@ "task_id": 126, "intent_template_id": 159, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of Canon photo printer in the One Stop Market?", - "intent_template": "What is the price range of {{product}} in the One Stop Market?", - "instantiation_dict": {"product": "Canon photo printer"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range of Canon photo printer in the One Stop Market?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range of {{product}} in the One Stop Market?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Canon photo printer", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -3615,12 +3464,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 2.56, "max": 649.99} ] } @@ -3633,19 +3481,17 @@ "task_id": 127, "intent_template_id": 1001, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the most frequent search terms that match available products in the store", - "intent_template": "Get the most frequent search terms that match available products in the store", + "intent": "Get the top 3 search terms that match available products in the store.", + "intent_template": "Get the top 3 search terms that match available products in the store.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Hollister", "Joust", "Antonia"] + "retrieved_data": ["Hollister", "Joust Bag", "Antonia Racer Tank"] } } ], @@ -3659,17 +3505,11 @@ "intent": "What's the total number of items sold in the most recent 2 complete orders?", "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "2"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", 
"items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [3] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [3] } } ], "revision": 2 @@ -3682,17 +3522,11 @@ "intent": "What's the total number of items sold in the most recent 4 complete orders?", "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "4"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [9] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [9] } } ], "revision": 2 @@ -3705,17 +3539,11 @@ "intent": "What's the total number of items sold in the most recent 5 complete orders?", "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "5"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [12] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [12] } } ], "revision": 2 @@ -3728,17 +3556,11 @@ "intent": "What's the total number of items sold in the most recent 7 complete orders?", "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", "instantiation_dict": {"k": "7"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - 
"retrieved_data": [17] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [17] } } ], "revision": 2 @@ -3751,17 +3573,11 @@ "intent": "How many commits did kilian make to a11yproject.com on March 5, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "March 5, 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [1] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } } ], "revision": 2 @@ -3774,17 +3590,11 @@ "intent": "How many commits did Eric make to a11yproject.com on March 2, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", "instantiation_dict": {"user": "Eric", "repo": "a11yproject.com", "date": "March 2, 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } } ], "revision": 2 @@ -3797,17 +3607,11 @@ "intent": "How many commits did kilian make to a11yproject.com on March 1, 2023?", "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "March 1, 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - 
"retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -3824,17 +3628,11 @@ "repo": "a11yproject.com", "date": "January 3, 2023" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [1] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } } ], "revision": 2 @@ -3851,17 +3649,11 @@ "repo": "a11y-webring.club", "date": "February 6, 2023" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [5] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } } ], "revision": 2 @@ -3871,23 +3663,18 @@ "task_id": 137, "intent_template_id": 51, "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "city1": "the city where the Liberty Bell is located", - "city2": "the home city of Pirates" + "city2": "the home city of Pirates", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["5h 47min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["5h 47min"] } } ], "revision": 2 @@ -3897,23 +3684,18 @@ "task_id": 138, "intent_template_id": 51, "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", + "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "city1": "the big apple", - "city2": "the city with the most authentic Philly cheesesteaks" + "city2": "the city with the most authentic Philly cheesesteaks", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["1h 58min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1h 58min"] } } ], "revision": 2 @@ -3923,20 +3705,18 @@ "task_id": 139, "intent_template_id": 51, "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", - "instantiation_dict": {"city1": "the hometown of Joe Biden", "city2": "Bridgeport"}, - "format_specification": null, + "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "city1": "the hometown of Joe Biden", + "city2": "Bridgeport", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["3h 20min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3h 20min"] } } ], "revision": 2 @@ -3946,20 +3726,18 @@ "task_id": 140, "intent_template_id": 51, "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?", - "instantiation_dict": {"city1": "the city of Niagara Falls", "city2": "the city of Yale University"}, - "format_specification": null, + "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "city1": "the city of Niagara Falls", + "city2": "the city of Yale University", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["8h 33min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["8h 33min"] } } ], "revision": 2 @@ -3969,20 +3747,18 @@ "task_id": 141, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on food-related shopping during March 2023 without considering shipping and handling fee", - "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "food-related", "time": "March 2023"}, - "format_specification": null, + "intent": "Return how much I spent on food-related shopping during March 2023 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "category": "food-related", + "time": "during March 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [32.41] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [32.41] } } ], "revision": 2 @@ -3992,20 +3768,18 @@ "task_id": 142, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023 without considering shipping and handling fee", - "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "hair care and hair style", "time": "Jan 2023"}, - "format_specification": null, + "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "category": "hair care and hair style", + "time": "during Jan 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [68.51] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [68.51] } } ], "revision": 2 @@ -4015,20 +3789,18 @@ "task_id": 143, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on home decoration shopping during January 29, 2023 without considering shipping and handling fee", - "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "home decoration", "time": "1/29/2023"}, - "format_specification": null, + "intent": "Return how much I spent on home decoration shopping during January 29, 2023 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "category": "home decoration", + "time": "during January 29, 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [260.69] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [260.69] } } ], "revision": 2 @@ -4038,20 +3810,18 @@ "task_id": 144, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on food shopping from January 15 to January 31 2023 without considering shipping and handling fee", - "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "food", "time": "from January 15 to the end Jan 2023"}, - "format_specification": null, + "intent": "Return how much I spent on food shopping from January 15 to January 31 2023 without considering shipping and handling fee. ", + "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "category": "food", + "time": "from January 15 to January 31 2023", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -4061,20 +3831,18 @@ "task_id": 145, "intent_template_id": 162, "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on cooking and food shopping during March 2022 without considering shipping and handling fee", - "intent_template": "Return how much I spent on {{category}} shopping during {{time}} without considering shipping and handling fee", - "instantiation_dict": {"category": "cooking and food", "time": "March 2022"}, - "format_specification": null, + "intent": "Return how much I spent on cooking and food shopping during March 2022 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "category": "cooking and food", + "time": "during March 2022", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [42.35] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [42.35] } } ], "revision": 2 @@ -4084,26 +3852,33 @@ "task_id": 146, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought in Sep 2022", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "size", "product": "picture frame", "time": "in Sep 2022"}, - "format_specification": "Use \"width\" for width in inches and \"height\" for height in inches.", + "intent": "Get the size of the picture frame I bought in Sep 2022. Return a list of objects with keys \"width\" and \"height\".", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "in Sep 2022", + "retrieved_data_format_spec": " Return a list of objects with keys \"width\" and \"height\"." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "width": {"type": "number"}, "height": {"type": "number"} }, - "required": ["width", "height"] + "properties": { "width": {"type": "string"}, "height": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"width": 16, "height": 24} ] + "retrieved_data": [ + { + "width": "^16(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$", + "height": "^24(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$" + } + ] } } ], @@ -4114,26 +3889,33 @@ "task_id": 147, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought in 2022", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "size", "product": "picture frame", "time": "in 2022"}, - "format_specification": "Use \"width\" for width in inches and \"height\" for height in inches.", + "intent": "Get the size of the picture frame I bought in 2022. Return a list of objects with keys \"width\" and \"height\".", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "option": "size", + "product": "picture frame", + "time": "in 2022", + "retrieved_data_format_spec": " Return a list of objects with keys \"width\" and \"height\"." 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "width": {"type": "number"}, "height": {"type": "number"} }, - "required": ["width", "height"] + "properties": { "width": {"type": "string"}, "height": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"width": 16, "height": 24} ] + "retrieved_data": [ + { + "width": "^16(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$", + "height": "^24(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$" + } + ] } } ], @@ -4144,17 +3926,20 @@ "task_id": 148, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the color of the picture frame I bought Sep 2022", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "color", "product": "picture frame", "time": "Sep 2022"}, - "format_specification": null, + "intent": "Get the color of the picture frame I bought Sep 2022.", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "option": "color", + "product": "picture frame", + "time": "Sep 2022", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Mist 16*24"] } @@ -4167,17 +3952,20 @@ "task_id": 149, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the color of the artifical plants I bought Feb 2023", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "color", "product": "artifical plants", "time": "Feb 2023"}, - 
"format_specification": null, + "intent": "Get the color of the artifical plants I bought Feb 2023.", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "option": "color", + "product": "artifical plants", + "time": "Feb 2023", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Green-vines"] } @@ -4190,20 +3978,19 @@ "task_id": 150, "intent_template_id": 155, "start_urls": ["__SHOPPING__"], - "intent": "Get the price of the fake tree I bought Jan 2023", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}", - "instantiation_dict": {"option": "price", "product": "fake tree", "time": "Jan 2023"}, - "format_specification": null, + "intent": "Get the price of the fake tree I bought Jan 2023.Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "option": "price", + "product": "fake tree", + "time": "Jan 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [260.69] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [260.69] } } ], "revision": 2 @@ -4213,20 +4000,18 @@ "task_id": 151, "intent_template_id": 36, "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from CMU to 
University of Pittsburgh?", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": {"location1": "CMU", "location2": "University of Pittsburgh"}, - "format_specification": null, + "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", + "instantiation_dict": { + "location1": "CMU", + "location2": "University of Pittsburgh", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["4min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } } ], "revision": 2 @@ -4236,20 +4021,18 @@ "task_id": 152, "intent_template_id": 36, "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from Schenley park to Upitt?", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": {"location1": "Schenley park", "location2": "Upitt"}, - "format_specification": null, + "intent": "What is the minimum travel time by car from Schenley park to Upitt? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location1": "Schenley park", + "location2": "Upitt", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["4min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } } ], "revision": 2 @@ -4259,20 +4042,18 @@ "task_id": 153, "intent_template_id": 36, "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from REI to CMU?", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": {"location1": "REI", "location2": "CMU"}, - "format_specification": null, + "intent": "What is the minimum travel time by car from REI to CMU? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location1": "REI", + "location2": "CMU", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["7min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["7min"] } } ], "revision": 2 @@ -4282,20 +4063,18 @@ "task_id": 154, "intent_template_id": 36, "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", - "instantiation_dict": {"location1": "CMU gates building", "location2": "Schenley park"}, - "format_specification": null, + "intent": "What is the minimum travel time by car from CMU gates building to Schenley park? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location1": "CMU gates building", + "location2": "Schenley park", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["4min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } } ], "revision": 2 @@ -4305,23 +4084,18 @@ "task_id": 155, "intent_template_id": 36, "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?", + "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "location1": "Animal Rescue League of Pittsburgh", - "location2": "Schenley park" + "location2": "Schenley park", + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["9min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["9min"] } } ], "revision": 2 @@ -4331,15 +4105,14 @@ "task_id": 156, "intent_template_id": 290, "start_urls": ["__GITLAB__"], - "intent": "Show me the merge requests assigned to me", - "intent_template": "Show me the merge requests assigned to me", + "intent": "Go to the merge requests assigned to me", + "intent_template": "Go to the merge requests assigned to me", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -4356,20 +4129,17 @@ "task_id": 157, "intent_template_id": 255, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show all customers", - "intent_template": "Show all customers", + "intent": "View the details of all customers", + "intent_template": "View the details of all customers", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": 
"SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": {"url": "__SHOPPING_ADMIN__/customer/index/"} } ], @@ -4380,23 +4150,19 @@ "task_id": 158, "intent_template_id": 171, "start_urls": ["__SHOPPING__"], - "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 11 cards", - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "intent": "Go to the product page of the best storage option that fits 11 Nintendo Switch game cards", + "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", "instantiation_dict": {"num": 11}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200 + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" } } ], @@ -4407,23 +4173,19 @@ "task_id": 159, "intent_template_id": 171, "start_urls": ["__SHOPPING__"], - "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 31 cards", - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage 
option to fit all {{num}} cards", + "intent": "Go to the product page of the best storage option that fits 31 Nintendo Switch game cards", + "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", "instantiation_dict": {"num": 31}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200 + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" } } ], @@ -4434,23 +4196,19 @@ "task_id": 160, "intent_template_id": 171, "start_urls": ["__SHOPPING__"], - "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 6 cards", - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "intent": "Go to the product page of the best storage option that fits 6 Nintendo Switch game cards", + "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", "instantiation_dict": {"num": 6}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, 
"expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200 + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" } } ], @@ -4461,23 +4219,19 @@ "task_id": 161, "intent_template_id": 171, "start_urls": ["__SHOPPING__"], - "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 23 cards", - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "intent": "Go to the product page of the best storage option that fits 23 Nintendo Switch game cards", + "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", "instantiation_dict": {"num": 23}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html", - "response_status": 200 + "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" } } ], @@ -4488,23 +4242,19 @@ "task_id": 
162, "intent_template_id": 171, "start_urls": ["__SHOPPING__"], - "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards", - "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards", + "intent": "Go to the product page of the best storage option that fits 40 Nintendo Switch game cards", + "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", "instantiation_dict": {"num": 40}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200 + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" } } ], @@ -4520,14 +4270,12 @@ "intent": "Get all review titles with 2 stars or below for the product on the current page.", "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Memory Card Came Defective"] } @@ -4543,17 +4291,11 @@ "intent": "Get all review titles with 2 stars or below for the product on the current page.", "intent_template": "Get all review 
titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Meh"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Meh"] } } ], "revision": 2 @@ -4566,14 +4308,12 @@ "intent": "Get all review titles with 2 stars or below for the product on the current page.", "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["So cute but too small", "Toe rubbed"] } @@ -4591,16 +4331,11 @@ "intent": "Get all review titles with 2 stars or below for the product on the current page.", "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -4613,14 +4348,12 @@ "intent": "Get all review titles with 2 stars or below for the product on the current page.", "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, 
"results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Waste of big money"] } @@ -4636,16 +4369,11 @@ "intent": "Get the project ID(s) of my personal project(s) that received more than 100 stars", "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", "instantiation_dict": {"description": "more than 100 stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -4658,17 +4386,11 @@ "intent": "Get the project ID(s) of my personal project(s) that received the most stars", "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", "instantiation_dict": {"description": "the most stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [187, 183] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [187, 183] } } ], "revision": 2 @@ -4681,14 +4403,12 @@ "intent": "Get the project ID(s) of my personal project(s) that received the least stars", "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", "instantiation_dict": {"description": "the least stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + 
"task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [193, 190, 189, 188, 184, 181] } @@ -4704,14 +4424,12 @@ "intent": "Get the project ID(s) of my personal project(s) that received less than 5 stars", "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", "instantiation_dict": {"description": "less than 5 stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [182, 179, 186, 185, 193, 190, 189, 188, 184, 181] } @@ -4724,17 +4442,15 @@ "task_id": 172, "intent_template_id": 289, "start_urls": ["__GITLAB__"], - "intent": "Tell me the project ID(s) of my personal project(s) that received no stars?", - "intent_template": "Tell me the project ID(s) of my personal project(s) that received {{description}}?", + "intent": "Get the project ID(s) of my personal project(s) that received no stars", + "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", "instantiation_dict": {"description": "no stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [193, 190, 189, 188, 184, 181] } @@ -4747,20 +4463,17 @@ "task_id": 173, "intent_template_id": 310, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"better\" in its title is closed", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", - "instantiation_dict": {"keyword": "better"}, - "format_specification": "Use a boolean where true means the issue is closed and false 
means the issue is opened.", + "intent": "Get whether my latest updated issue with \"better\" in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "better", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4770,20 +4483,17 @@ "task_id": 174, "intent_template_id": 310, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"feature\" in its title is closed", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", - "instantiation_dict": {"keyword": "feature"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest updated issue with \"feature\" in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "feature", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4793,20 +4503,17 @@ "task_id": 175, "intent_template_id": 310, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"dependency\" in its title is closed", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", - "instantiation_dict": {"keyword": "dependency"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest updated issue with \"dependency\" in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "dependency", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4816,20 +4523,17 @@ "task_id": 176, "intent_template_id": 310, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"theme editor\" in its title is closed", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", - "instantiation_dict": {"keyword": "theme editor"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest updated issue with \"theme editor\" in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "theme editor", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4839,20 +4543,17 @@ "task_id": 177, "intent_template_id": 310, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"homepage content\" in its title is closed", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed", - "instantiation_dict": {"keyword": "homepage content"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest updated issue with \"homepage content\" in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "homepage content", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -4862,19 +4563,17 @@ "task_id": 178, "intent_template_id": 500, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with better in its title is closed", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", - "instantiation_dict": {"keyword": "better"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest created issue with better in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "better", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -4884,19 +4583,17 @@ "task_id": 179, "intent_template_id": 500, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with feature in its title is closed", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", - "instantiation_dict": {"keyword": "feature"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest created issue with feature in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "feature", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -4906,19 +4603,17 @@ "task_id": 180, "intent_template_id": 500, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with dependency in its title is closed", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", - "instantiation_dict": {"keyword": "dependency"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest created issue with dependency in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "dependency", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4928,19 +4623,17 @@ "task_id": 181, "intent_template_id": 500, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with theme editor in its title is closed", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", - "instantiation_dict": {"keyword": "theme editor"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest created issue with theme editor in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "theme editor", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [false] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } } ], "revision": 2 @@ -4950,19 +4643,17 @@ "task_id": 182, "intent_template_id": 500, "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with homepage content in its title is closed", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed", - "instantiation_dict": {"keyword": "homepage content"}, - "format_specification": "Use a boolean where true means the issue is closed and false means the issue is opened.", + "intent": "Get whether my latest created issue with homepage content in its title is closed. Return a boolean (true if closed, false if opened).", + "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "keyword": "homepage content", + "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [true] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } } ], "revision": 2 @@ -4973,18 +4664,13 @@ "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Give me the SKU of the products that have 10 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": {"Attribute": "SKU", "N": "10"}, - "format_specification": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", + "instantiation_dict": {"Attribute": "SKU", "N": "10", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -4994,19 +4680,27 @@ "task_id": 184, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the name of the products that have 0 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": {"Attribute": "name", "N": "0"}, - "format_specification": null, + "intent": "Give me the name and color of the products that have 0 units left. 
Return a list of objects with keys \"name\" and \"color\".", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", + "instantiation_dict": { + "Attribute": "name and color", + "N": "0", + "retrieved_data_format_spec": ". Return a list of objects with keys \"name\" and \"color\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { "name": {"type": "string"}, "color": {"type": "string"} } + } + }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Cronus Yoga Pant -33-Blue"] + "retrieved_data": [ {"name": "Cronus Yoga Pant -33-Blue", "color": "Blue"} ] } } ], @@ -5018,16 +4712,14 @@ "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Give me the material of the products that have 3 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": {"Attribute": "material", "N": "3"}, - "format_specification": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", + "instantiation_dict": {"Attribute": "material", "N": "3", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Cotton", "Fleece"] } @@ -5040,24 +4732,25 @@ "task_id": 186, "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the product names and the sizes of the products that have 2-3 units left", - "intent_template": "Give me the {{Attribute}} of the products 
that have {{N}} units left", - "instantiation_dict": {"Attribute": "product names and the sizes", "N": "2-3"}, - "format_specification": "Use \"name\" for the product name and \"size\" for the size.", + "intent": "Give me the product names and the sizes of the products that have 2-3 units left. Return a list of objects with keys \"name\" and \"size\".", + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", + "instantiation_dict": { + "Attribute": "product names and the sizes", + "N": "2-3", + "retrieved_data_format_spec": ". Return a list of objects with keys \"name\" and \"size\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "size": {"type": "string"} }, - "required": ["name", "size"] + "properties": { "name": {"type": "string"}, "size": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "name": "Eos V-Neck Hoodie-S-Blue" , "size": "S" }, @@ -5074,16 +4767,14 @@ "intent_template_id": 368, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Give me the SKU of the products that have 1-3 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left", - "instantiation_dict": {"Attribute": "SKU", "N": "1-3"}, - "format_specification": null, + "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", + "instantiation_dict": {"Attribute": "SKU", "N": "1-3", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["WH11-S-Blue", 
"WS08-XS-Blue"] } @@ -5096,20 +4787,17 @@ "task_id": 188, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"cancelled\"", - "intent_template": "Get the total cost of my latest order {{status}}", - "instantiation_dict": {"status": "marked as 'cancelled'"}, - "format_specification": null, + "intent": "Get the total cost of my latest order marked as \"cancelled\". Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the total cost of my latest order {{status}}. {{retrieved_data_format_spec}}", + "instantiation_dict": { + "status": "marked as \"cancelled\"", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [365.42] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [365.42] } } ], "revision": 2 @@ -5119,20 +4807,17 @@ "task_id": 189, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"pending\"", - "intent_template": "Get the total cost of my latest order {{status}}", - "instantiation_dict": {"status": "marked as 'pending'"}, - "format_specification": null, + "intent": "Get the total cost of my latest order marked as \"pending\". Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the total cost of my latest order {{status}}. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "status": "marked as \"pending\"", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [754.99] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [754.99] } } ], "revision": 2 @@ -5142,20 +4827,17 @@ "task_id": 190, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"complete\"", - "intent_template": "Get the total cost of my latest order {{status}}", - "instantiation_dict": {"status": "marked as 'complete'"}, - "format_specification": null, + "intent": "Get the total cost of my latest order marked as \"complete\". Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the total cost of my latest order {{status}}. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "status": "marked as \"complete\"", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [65.32] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [65.32] } } ], "revision": 2 @@ -5165,19 +4847,14 @@ "task_id": 191, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"processing\"", - "intent_template": "Get the total cost of my latest order {{status}}", - "instantiation_dict": {"status": "marked as 'processing'"}, - "format_specification": null, + "intent": "Get the total cost of my latest order marked as \"processing\". ", + "intent_template": "Get the total cost of my latest order {{status}}. {{retrieved_data_format_spec}}", + "instantiation_dict": {"status": "marked as \"processing\"", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -5187,20 +4864,17 @@ "task_id": 192, "intent_template_id": 214, "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order that is not cancelled", - "intent_template": "Get the total cost of my latest order {{status}}", - "instantiation_dict": {"status": "that is not cancelled"}, - "format_specification": null, + "intent": "Get the total cost of my latest order that is not cancelled. 
Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the total cost of my latest order {{status}}. {{retrieved_data_format_spec}}", + "instantiation_dict": { + "status": "that is not cancelled", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [754.99] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [754.99] } } ], "revision": 2 @@ -5210,20 +4884,17 @@ "task_id": 193, "intent_template_id": 367, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 2 completed orders", - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": {"status": "completed", "N": "2"}, - "format_specification": null, + "intent": "Get the total payment amount of the last 2 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "payment_query": "Get the total payment amount of the last 2 completed orders", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [182.4] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [182.4] } } ], "revision": 2 @@ -5233,20 +4904,17 @@ "task_id": 194, "intent_template_id": 367, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 completed orders", - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": {"status": "completed", "N": "5"}, - "format_specification": null, + "intent": "Get the total payment amount of the last 5 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "payment_query": "Get the total payment amount of the last 5 completed orders", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [555.2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [555.2] } } ], "revision": 2 @@ -5256,20 +4924,17 @@ "task_id": 195, "intent_template_id": 367, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 pending orders", - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": {"status": "pending", "N": "5"}, - "format_specification": null, + "intent": "Get the total payment amount of the last 5 pending orders. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "payment_query": "Get the total payment amount of the last 5 pending orders", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [885.4] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [885.4] } } ], "revision": 2 @@ -5279,20 +4944,17 @@ "task_id": 196, "intent_template_id": 367, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders", - "intent_template": "Get the payment difference between the last {{N}} {{status_1}} orders and the last {{N}} {{status_2}} orders", - "instantiation_dict": {"status_1": "cancelled", "status_2": "completed", "N": "4"}, - "format_specification": null, + "intent": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "payment_query": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [194.25] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [194.25] } } ], "revision": 2 @@ -5302,20 +4964,17 @@ "task_id": 197, "intent_template_id": 367, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 non-cancelled orders", - "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders", - "instantiation_dict": {"status": "non-cancelled", "N": "5"}, - "format_specification": null, + "intent": "Get the total payment amount of the last 5 non-cancelled orders. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "payment_query": "Get the total payment amount of the last 5 non-cancelled orders", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [778.2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [778.2] } } ], "revision": 2 @@ -5325,19 +4984,21 @@ "task_id": 198, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name of the most recent cancelled order", - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "customer name", "status": "most recent cancelled"}, - "format_specification": null, + "intent": "Get the customer email of the most recent cancelled order.", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "customer email", + "status": "most recent cancelled", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Lily Potter"] + "retrieved_data": ["harrypotterfan1@gmail.com"] } } ], @@ -5348,20 +5009,18 @@ "task_id": 199, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the order ID of the newest pending order", - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "order ID", "status": "newest pending"}, - "format_specification": 
null, + "intent": "Get the order ID of the newest pending order.", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "order ID", + "status": "newest pending", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [299] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [299] } } ], "revision": 2 @@ -5371,20 +5030,18 @@ "task_id": 200, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the billing name of the oldest complete order", - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "billing name", "status": "oldest complete"}, - "format_specification": null, + "intent": "Get the billing name of the oldest complete order.", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "billing name", + "status": "oldest complete", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["John Lee"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["John Lee"] } } ], "revision": 2 @@ -5394,19 +5051,18 @@ "task_id": 201, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name of the earliest fraud suspect order", - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "customer name", "status": "earliest fraud suspect"}, 
- "format_specification": null, + "intent": "Get the customer email of the earliest fraud suspect order.", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "customer email", + "status": "earliest fraud suspect", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -5416,17 +5072,19 @@ "task_id": 202, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the date of the most recent cancelled order", - "intent_template": "Get the {{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "date", "status": "most recent cancelled"}, - "format_specification": null, + "intent": "Get the date of the most recent cancelled order.Return the date in YYYY-MM-DD format or null if not available, without any additional details", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "date", + "status": "most recent cancelled", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["May 23 2023"] } @@ -5439,24 +5097,25 @@ "task_id": 203, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the purchase date and order id of the most recent pending order", - "intent_template": "Get the 
{{attribute}} of the {{status}} order", - "instantiation_dict": {"attribute": "purchase date and order id", "status": "most recent pending"}, - "format_specification": "Use \"date\" for the date and \"order_id\" for the order id.", + "intent": "Get the purchase date and order id of the most recent pending order.Return a list of objects with keys \"purchase_date\" (YYYY-MM-DD format or null if not available) and \"order_id\", without any additional details", + "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "purchase date and order id", + "status": "most recent pending", + "retrieved_data_format_spec": "Return a list of objects with keys \"purchase_date\" (YYYY-MM-DD format or null if not available) and \"order_id\", without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "date": {"type": "string", "format": "date"}, "order_id": {"type": "string"} }, - "required": ["date", "order_id"] + "properties": { "date": {"type": "string", "format": "date"}, "order_id": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"date": "May 31, 2023", "order_id": "000000299"} ] } @@ -5469,13 +5128,13 @@ "task_id": 204, "intent_template_id": 366, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the product name and final price (low to high) of the most recent completed order", - "intent_template": "Get the {{attribute}} of the {{status}} order", + "intent": "Get the product name and final price (low to high) of the most recent completed order.Return a list of objects with keys \"name\" (product name) and \"price\" (as number, e.g., 10.99) only, without any additional details", + "intent_template": "Get the {{attribute}} of the {{status}} 
order.{{retrieved_data_format_spec}}", "instantiation_dict": { "attribute": "product name and final price (low to high)", - "status": "most recent completed" + "status": "most recent completed", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" (product name) and \"price\" (as number, e.g., 10.99) only, without any additional details" }, - "format_specification": "Use \"name\" for the product name and \"price\" for the final price.", "eval": [ { "evaluator": "AgentResponseEvaluator", @@ -5487,16 +5146,15 @@ "properties": { "name" : { "type": "string" }, "price": { "type": "number", "format": "currency" } - }, - "required": ["name", "price"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "name": "Proteus Fitness Jackshirt" , "price": 45.0 }, - { "name": "Ida Workout Parachute Pant", "price": 38.4 } + { "name": "Ida Workout Parachute Pant", "price": 38.4 }, + { "name": "Proteus Fitness Jackshirt" , "price": 45.0 } ] } } @@ -5509,19 +5167,13 @@ "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], "intent": "How many commits did kilian make on March 5, 2023 for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}} for the current project?", - "instantiation_dict": {"user": "kilian", "date": "March 5, 2023"}, - "format_specification": null, + "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", + "instantiation_dict": {"user": "kilian", "date": "March 5, 2023", "modifier": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [1] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } } ], "revision": 2 @@ -5532,19 +5184,13 
@@ "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], "intent": "How many commits did Eric make on March 2, 2023 for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}} for the current project?", - "instantiation_dict": {"user": "Eric", "date": "March 2, 2023"}, - "format_specification": null, + "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", + "instantiation_dict": {"user": "Eric", "date": "March 2, 2023", "modifier": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [2] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } } ], "revision": 2 @@ -5555,19 +5201,13 @@ "intent_template_id": 320, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], "intent": "How many commits did Eric and Kilian make on January 3, 2023 in total for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}} in total for the current project?", - "instantiation_dict": {"user": "Eric and Kilian", "date": "January 3, 2023"}, - "format_specification": null, + "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", + "instantiation_dict": {"user": "Eric and Kilian", "date": "January 3, 2023", "modifier": " in total"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [1] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } } ], "revision": 2 @@ -5577,24 +5217,24 @@ "task_id": 208, "intent_template_id": 364, "start_urls": 
["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number +1 2058812302", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": {"PhoneNum": "+1 2058812302"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "intent": "Get the customer name and email with phone number +1 2058812302. Return a list of objects with keys \"name\" and \"email\".", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "PhoneNum": "+1 2058812302", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, - "required": ["email", "name"] + "properties": { "name": {"type": "string"}, "email": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"name": "John Smith", "email": "john.smith.xyz@gmail.com"} ] } @@ -5607,24 +5247,24 @@ "task_id": 209, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 2137418080", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": {"PhoneNum": "2137418080"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "intent": "Get the customer name and email with phone number 2137418080. Return a list of objects with keys \"name\" and \"email\".", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "PhoneNum": "2137418080", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, - "required": ["email", "name"] + "properties": { "name": {"type": "string"}, "email": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"name": "Jennifer White", "email": "jennifer.white@yahoo.com"} ] } @@ -5637,24 +5277,24 @@ "task_id": 210, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 2065555555", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": {"PhoneNum": "2065555555"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "intent": "Get the customer name and email with phone number 2065555555. Return a list of objects with keys \"name\" and \"email\".", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "PhoneNum": "2065555555", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, - "required": ["email", "name"] + "properties": { "name": {"type": "string"}, "email": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"name": "Adam Garcia", "email": "gamingpro456@gmail.com"} ] } @@ -5667,24 +5307,24 @@ "task_id": 211, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 8015551212", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": {"PhoneNum": "8015551212"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "intent": "Get the customer name and email with phone number 8015551212. Return a list of objects with keys \"name\" and \"email\".", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "PhoneNum": "8015551212", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, - "required": ["email", "name"] + "properties": { "name": {"type": "string"}, "email": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"name": "Sean Miller", "email": "sean.miller@gmail.com"} ] } @@ -5697,24 +5337,24 @@ "task_id": 212, "intent_template_id": 364, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 555-229-3326", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}", - "instantiation_dict": {"PhoneNum": "555-229-3326"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email.", + "intent": "Get the customer name and email with phone number 555-229-3326. Return a list of objects with keys \"name\" and \"email\".", + "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "PhoneNum": "555-229-3326", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} }, - "required": ["email", "name"] + "properties": { "name": {"type": "string"}, "email": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"name": "Veronica Costello", "email": "roni_cost@example.com"} ] } @@ -5727,24 +5367,24 @@ "task_id": 213, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Antonia Racer Tank.", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Antonia Racer Tank"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get the title and rating for all reviews with 3 stars or below for Antonia Racer Tank. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Antonia Racer Tank", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "title": "Zero support/modesty", "rating": "2" }, @@ -5760,24 +5400,24 @@ "task_id": 214, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Erica Sports Bra.", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Erica Sports Bra"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get the title and rating for all reviews with 3 stars or below for Erica Sports Bra. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Erica Sports Bra", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "title": "Doesn't fit me. Luma fail.", "rating": "2" }, @@ -5793,24 +5433,24 @@ "task_id": 215, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Circe ice fleece.", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Circe ice fleece"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get the title and rating for all reviews with 3 stars or below for Circe ice fleece. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Circe ice fleece", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "Bad!", "rating": "1"} ] } @@ -5823,24 +5463,24 @@ "task_id": 216, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Electra Bra Top.", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Electra Bra Top"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get the title and rating for all reviews with 3 stars or below for Electra Bra Top. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Electra Bra Top", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"title": "Not exactly true to size", "rating": "3"} ] } @@ -5853,24 +5493,24 @@ "task_id": 217, "intent_template_id": 249, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Pursuit Tone Band.", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}.", - "instantiation_dict": {"product": "Pursuit Tone Band"}, - "format_specification": "Use \"title\" for the review title and \"rating\" for rating (number of stars).", + "intent": "Get the title and rating for all reviews with 3 stars or below for Pursuit Tone Band. Return a list of objects with keys \"title\" and \"rating\".", + "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Pursuit Tone Band", + "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} }, - "required": ["rating", "title"] + "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "title": "Agreed. More resistance", "rating": "3" }, @@ -5886,19 +5526,18 @@ "task_id": 218, "intent_template_id": 41, "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to CMU, Pittsburgh that take at most 6 minutes", - "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", - "instantiation_dict": {"location": "CMU, Pittsburgh", "n": "5"}, - "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "intent": "Get the name of the hotel and walking distance of nearby hotels to CMU, Pittsburgh that take at most 5 minutes. Use \"hotel\" for the name and \"distance\" for the distance. (Use the OSRM direction service.)", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "CMU, Pittsburgh", + "n": "5", + "retrieved_data_format_spec": "Use \"hotel\" for the name and \"distance\" for the distance" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -5908,19 +5547,18 @@ "task_id": 219, "intent_template_id": 41, "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to Pittsburgh airport that take at most 3 minutes", - "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", - "instantiation_dict": {"location": "Pittsburgh airport", "n": "3"}, - "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "intent": "Get the name of the hotel and walking distance of nearby hotels to Pittsburgh airport that take at most 3 minutes. Use \"hotel\" for the name and \"distance\" for the distance. (Use the OSRM direction service.)", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "Pittsburgh airport", + "n": "3", + "retrieved_data_format_spec": "Use \"hotel\" for the name and \"distance\" for the distance" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -5930,31 +5568,32 @@ "task_id": 220, "intent_template_id": 41, "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes", - "intent_template": "Tell me the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes", - "instantiation_dict": {"location": "Gardner Steel Conference Center,", "n": 5}, - "format_specification": "Use \"hotel\" for the name and \"distance\" for the distance.", + "intent": "Get the name of the hotel and walking distance of nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes. Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "Gardner Steel Conference Center,", + "n": 5, + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "name" : { "type": "string" }, - "distance": { "type": "string", "format": "distance" } - }, - "required": ["distance", "name"] + "hotel" : { "type": "string", "format": "location-name" }, + "distance": { "type": "string", "format": "distance" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "name": "Wyndham Pittsburgh University Cente", "distance": "375m" }, - { "name": "The Oaklander Hotel" , "distance": "338m" } + { "distance": "375m", "hotel": "Wyndham Pittsburgh University Center" }, + { "distance": "338m", "hotel": "The Oaklander Hotel" } ] } } @@ -5966,14 +5605,15 @@ "task_id": 221, "intent_template_id": 35, "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to the nearest USPS postal office with different transportation methods?", - "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", - "instantiation_dict": {"location": "USPS postal office"}, - "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest USPS postal office with different transportation methods? Return a list of objects with keys \"transportation_method\" (Walking, Driving, or Biking) and \"duration\" (in HH:MM:SS format) only, without any additional details or text. 
(Use the OSRM direction service.)", + "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", + "instantiation_dict": { + "location": "USPS postal office", + "retrieved_data_format_spec": "Return a list of objects with keys \"transportation_method\" (Walking, Driving, or Biking) and \"duration\" (in HH:MM:SS format) only, without any additional details or text" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -5981,12 +5621,11 @@ "properties": { "transportation_method": { "type": "string" }, "duration" : { "type": "string", "format": "duration" } - }, - "required": ["duration", "transportation_method"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "transportation_method": "Walking", "duration": "19min" }, @@ -6003,20 +5642,17 @@ "task_id": 222, "intent_template_id": 35, "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to the nearest cold stone ice cream with different transportation methods?", - "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", - "instantiation_dict": {"location": "cold stone ice cream"}, - "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest cold stone ice cream with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", + "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "cold stone ice cream", + "retrieved_data_format_spec": "Return duration in HH:MM:SS format" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["3min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3min"] } } ], "revision": 2 @@ -6026,20 +5662,17 @@ "task_id": 223, "intent_template_id": 35, "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to the nearest McDonald's with different transportation methods?", - "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", - "instantiation_dict": {"location": "Mcdonald's"}, - "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest Mcdonald's with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", + "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "Mcdonald's", + "retrieved_data_format_spec": "Return duration in HH:MM:SS format" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["4min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } } ], "revision": 2 @@ -6049,20 +5682,17 @@ "task_id": 224, "intent_template_id": 35, "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to the nearest wendys with different transportation methods?", - "intent_template": "I am at CMU Pittsburgh, how long does it take to the nearest {{location}} with different transportation methods?", - "instantiation_dict": {"location": "wendys"}, - "format_specification": "Use \"transportation_method\" for the transportation method and \"duration\" for the duration.", + "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest wendys with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", + "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "location": "wendys", + "retrieved_data_format_spec": "Return duration in HH:MM:SS format" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["3min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3min"] } } ], "revision": 2 @@ -6075,16 +5705,11 @@ "intent": "Return the titles for reviews with 3 stars or below for brush from sephora", "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", "instantiation_dict": {"product_type": "brush", "manufature": "sephora"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -6094,14 +5719,15 @@ "task_id": 226, "intent_template_id": 370, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from Amazon basic?", - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": {"brand": "Amazon basic"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range for products from Amazon basic?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "brand": "Amazon basic", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6109,12 +5735,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 5.49, "max": 375.19} ] } @@ -6127,14 +5752,15 @@ "task_id": 227, "intent_template_id": 370, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from EYZUTAK?", - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": {"brand": "EYZUTAK"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range for products from EYZUTAK?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "brand": "EYZUTAK", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6142,12 +5768,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 9.99, "max": 9.99} ] } @@ -6160,14 +5785,15 @@ "task_id": 228, "intent_template_id": 370, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from sephora?", - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": {"brand": "sephora"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range for products from sephora?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "brand": "sephora", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6175,12 +5801,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 18.18, "max": 94.99} ] } @@ -6193,14 +5818,15 @@ "task_id": 229, "intent_template_id": 370, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from ugreen?", - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": {"brand": "ugreen"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range for products from ugreen?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "brand": "ugreen", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6208,12 +5834,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 6.99, "max": 38.99} ] } @@ -6226,14 +5851,15 @@ "task_id": 230, "intent_template_id": 370, "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from Perricone MD?", - "intent_template": "What is the price range for products from {{brand}}?", - "instantiation_dict": {"brand": "Perricone MD"}, - "format_specification": "Use \"min\" for lowest price and \"max\" for highest price.", + "intent": "What is the price range for products from Perricone MD?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "brand": "Perricone MD", + "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6241,12 +5867,11 @@ "properties": { "min": { "type": "number", "format": "currency" }, "max": { "type": "number", "format": "currency" } - }, - "required": ["max", "min"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"min": 35.0, "max": 149.0} ] } @@ -6262,16 +5887,14 @@ "intent": "Get the order number of my most recent cancelled order", "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "cancelled"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [170] + "retrieved_data": ["^#?\\s*0*170$"] } } ], @@ -6285,16 +5908,14 @@ "intent": "Get the order number of my most recent pending order", "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "pending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [189] + "retrieved_data": ["^#?\\s*0*189$"] } } ], @@ -6308,16 +5929,14 @@ "intent": "Get the order 
number of my most recent complete order", "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "complete"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "number"} }, + "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [180] + "retrieved_data": ["^#?\\s*0*180$"] } } ], @@ -6331,16 +5950,11 @@ "intent": "Get the order number of my most recent on hold order", "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "on hold"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -6353,16 +5967,11 @@ "intent": "Get the order number of my most recent under delivery order", "intent_template": "Get the order number of my most recent {{status}} order", "instantiation_dict": {"status": "under delivery"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -6372,18 +5981,17 @@ "task_id": 236, "intent_template_id": 39, "start_urls": ["__MAP__"], - "intent": "Get the name and address of the nearest pharmacy from Carnegie Mellon I can walk within 20mins", - "intent_template": "Get the name and address of the nearest {{location}} from 
{{location2}} {{condition}}", + "intent": "Get the name and address of the nearest pharmacy from Carnegie Mellon I can walk within 20mins (use OSRM direction service). Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\".", + "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}. {{retrieved_data_format_spec}}.", "instantiation_dict": { "location": "pharmacy", "location2": "Carnegie Mellon", - "condition": "I can walk within 20mins" + "condition": "I can walk within 20mins (use OSRM direction service)", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"" }, - "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": { @@ -6396,12 +6004,11 @@ "city" : { "type": "string" }, "state" : { "type": "string" }, "postcode" : { "type": "string" } - }, - "required": ["city", "name", "state", "house_number", "street", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -6423,14 +6030,17 @@ "task_id": 237, "intent_template_id": 39, "start_urls": ["__MAP__"], - "intent": "Get the name and address of the nearest gas station from CMU", - "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}", - "instantiation_dict": {"location": "gas station", "location2": "CMU", "condition": ""}, - "format_specification": "Return an object with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"", + "intent": "Get the name and address of the nearest gas station from CMU . 
Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\".", + "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "gas station", + "location2": "CMU", + "condition": "", + "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": { @@ -6443,12 +6053,11 @@ "city" : { "type": "string" }, "state" : { "type": "string" }, "postcode" : { "type": "string" } - }, - "required": ["city", "name", "state", "house_number", "street", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -6470,23 +6079,19 @@ "task_id": 238, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from PS4 accessories category", - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive PS4 accessories", + "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", "instantiation_dict": {"product_category": "PS4 accessories"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - 
"site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html", - "response_status": 200 + "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html" } } ], @@ -6497,23 +6102,19 @@ "task_id": 239, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from nutrition bars and drinks category", - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive nutrition bars and drinks", + "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", "instantiation_dict": {"product_category": "nutrition bars and drinks"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html", - "response_status": 200 + "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html" } } ], @@ -6524,20 +6125,17 @@ "task_id": 240, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from men's Uniforms, Work & Safety category", - "intent_template": "I am doing a market survey 
for one stop market, show me the most expensive product from {{product_category}} category", + "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive men's Uniforms, Work & Safety", + "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", "instantiation_dict": {"product_category": "men's Uniforms, Work & Safety"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/arc-teryx-beta-sv-bib-men-s-versatile-durable-gore-tex-pro-bib-for-severe-conditions.html" } @@ -6550,23 +6148,19 @@ "task_id": 241, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from skin care category", - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive skin care", + "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", "instantiation_dict": {"product_category": "skin care"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": 
"shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html", - "response_status": 200 + "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html" } } ], @@ -6577,23 +6171,19 @@ "task_id": 242, "intent_template_id": 138, "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, show me the most expensive product from Household Supplies category", - "intent_template": "I am doing a market survey for one stop market, show me the most expensive product from {{product_category}} category", + "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive Household Supplies", + "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", "instantiation_dict": {"product_category": "Household Supplies"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html", - "response_status": 200 + "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html" } } ], @@ -6604,17 +6194,15 @@ "task_id": 243, "intent_template_id": 244, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the email address of the customer who is the most unhappy with 
Circe fleece", - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "intent": "Get the email address of the customer who is the most unhappy with Circe fleece", + "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", "product": "Circe fleece"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["hannah.lim@gmail.com"] } @@ -6627,17 +6215,15 @@ "task_id": 244, "intent_template_id": 244, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket", - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "intent": "Get the email address of the customer who is the most unhappy with Olivia zip jacket", + "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", "product": "Olivia zip jacket"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["emma.lopez@gmail.com"] } @@ -6650,20 +6236,14 @@ "task_id": 245, "intent_template_id": 244, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the name of the customer who is the most unhappy with Antonia racer tank", - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "intent": "Get the name of the customer 
who is the most unhappy with Antonia racer tank", + "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "name", "product": "Antonia racer tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Shaunte"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Shaunte"] } } ], "revision": 2 @@ -6673,20 +6253,14 @@ "task_id": 246, "intent_template_id": 244, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the name of the customer who is the most unhappy with Chloe tank", - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "intent": "Get the name of the customer who is the most unhappy with Chloe tank", + "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "name", "product": "Chloe tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Teofila"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Teofila"] } } ], "revision": 2 @@ -6696,19 +6270,14 @@ "task_id": 247, "intent_template_id": 244, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products", - "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}", + "intent": "Get the email address of the customer who is the most unhappy with the 
style of Zoe products", + "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", "instantiation_dict": {"information": "email address", "product": "the style of Zoe products"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -6718,17 +6287,18 @@ "task_id": 248, "intent_template_id": 46, "start_urls": ["__MAP__"], - "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format", - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9"}, - "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "intent": "Get the coordinates of Carnegie Mellon Caf\u00e9. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", + "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "Carnegie Mellon Caf\u00e9", + "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"format": "coordinates"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"latitude": "40.4424191", "longitude": "-79.9397388"} ] } @@ -6741,17 +6311,18 @@ "task_id": 249, "intent_template_id": 46, "start_urls": ["__MAP__"], - "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format", - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": {"location": "Western Pennsylvania Hospital Heliport"}, - "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "intent": "Get the coordinates of Western Pennsylvania Hospital Heliport. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", + "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "Western Pennsylvania Hospital Heliport", + "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"format": "coordinates"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"latitude": "40.46076", "longitude": "-79.94666"} ] } @@ -6764,17 +6335,18 @@ "task_id": 250, "intent_template_id": 46, "start_urls": ["__MAP__"], - "intent": "Tell me the coordinates of Apple Store near Pitt in DD format", - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": {"location": "Apple Store near Pitt"}, - "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "intent": "Get the coordinates of Apple Store near Pitt. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", + "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "Apple Store near Pitt", + "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"format": "coordinates"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"latitude": "40.4511693", "longitude": "-79.9334241"} ] } @@ -6787,17 +6359,18 @@ "task_id": 251, "intent_template_id": 46, "start_urls": ["__MAP__"], - "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format", - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": {"location": "bus stop on the Carnegie art museum side of the street near CMU"}, - "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "intent": "Get the coordinates of bus stop on the Carnegie art museum side of the street near CMU. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", + "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "bus stop on the Carnegie art museum side of the street near CMU", + "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"format": "coordinates"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"latitude": "40.4443", "longitude": "-79.94889"} ] } @@ -6810,17 +6383,18 @@ "task_id": 252, "intent_template_id": 46, "start_urls": ["__MAP__"], - "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format", - "intent_template": "Tell me the coordinates of {{location}} in DD format", - "instantiation_dict": {"location": "Tokyo Japanese Food Store in Pittsburgh"}, - "format_specification": "Use \"latitude\" for the latitude coordinate and \"longitude\" for the longitude coordinate.", + "intent": "Get the coordinates of Tokyo Japanese Food Store in Pittsburgh. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", + "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "location": "Tokyo Japanese Food Store in Pittsburgh", + "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"format": "coordinates"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ {"latitude": "40.45761", "longitude": "-79.92934"} ] } @@ -6833,19 +6407,18 @@ "task_id": 253, "intent_template_id": 501, "start_urls": ["__MAP__"], - "intent": "What is the phone number of Carnegie Mellon Caf\u00e9", - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": {"location": "Carnegie Mellon Caf\u00e9", "information": "phone number"}, - "format_specification": null, + "intent": "Get the phone number for Carnegie Mellon Caf\u00e9", + "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", + "instantiation_dict": { + "information": "phone number", + "location": "Carnegie Mellon Caf\u00e9", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -6855,17 +6428,19 @@ "task_id": 254, "intent_template_id": 501, "start_urls": ["__MAP__"], - "intent": "What is the phone number of Western Pennsylvania Hospital", - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": {"location": "Western Pennsylvania Hospital", "information": "phone number"}, - "format_specification": null, + "intent": "Get the phone number for Western Pennsylvania 
Hospital", + "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", + "instantiation_dict": { + "information": "phone number", + "location": "Western Pennsylvania Hospital", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4125785000"] } @@ -6878,17 +6453,19 @@ "task_id": 255, "intent_template_id": 501, "start_urls": ["__MAP__"], - "intent": "Who is the operator of PIT airport", - "intent_template": "Who is the {{information}} of {{location}}", - "instantiation_dict": {"location": "PIT airport", "information": "operator"}, - "format_specification": null, + "intent": "Get the operator for PIT airport", + "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", + "instantiation_dict": { + "information": "operator", + "location": "PIT airport", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Allegheny County Airport Authority"] } @@ -6901,17 +6478,19 @@ "task_id": 256, "intent_template_id": 501, "start_urls": ["__MAP__"], - "intent": "What is the website of Carnegie art museum in pittsburgh", - "intent_template": "What is the {{information}} of {{location}}", - "instantiation_dict": {"location": "Carnegie art museum in pittsburgh", "information": "website"}, - "format_specification": null, + "intent": "Get the website for Carnegie art museum in pittsburgh", + "intent_template": "Get the {{information}} for 
{{location}}{{retrieved_data_format_spec}}", + "instantiation_dict": { + "information": "website", + "location": "Carnegie art museum in pittsburgh", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["http://web.cmoa.org/"] } @@ -6924,17 +6503,16 @@ "task_id": 257, "intent_template_id": 501, "start_urls": ["__MAP__"], - "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh", - "intent_template": "What is the {{information}} of {{location}}", + "intent": "Get the hours of operation for Tokyo Japanese Food Store in Pittsburgh. Return a list of objects with keys \"day\", \"open_time\", and \"close_time\".", + "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", "instantiation_dict": { + "information": "hours of operation", "location": "Tokyo Japanese Food Store in Pittsburgh", - "information": "hours of operation" + "retrieved_data_format_spec": ". Return a list of objects with keys \"day\", \"open_time\", and \"close_time\"." }, - "format_specification": "Provide the information for each day of the week when the location is open. 
Use \"day\" for the day of the week, \"open_time\" for the opening time, and \"close_time\" for the closing time.", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -6943,12 +6521,11 @@ "day" : { "type": "string" }, "open_time" : { "type": "string" }, "close_time": { "type": "string" } - }, - "required": ["close_time", "day", "open_time"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "day": "Wednesday", "open_time": "10:00", "close_time": "17:00" }, @@ -6967,25 +6544,18 @@ "task_id": 258, "intent_template_id": 325, "start_urls": ["__GITLAB__"], - "intent": "Show me all public projects", - "intent_template": "Show me all public projects", + "intent": "Open the public projects listing", + "intent_template": "Open the public projects listing", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, - "expected": { - "url": "__GITLAB__/explore", - "response_status": 200, - "query_params": { "visibility_level": ["20"] } - } + "expected": { "url": "__GITLAB__/explore", "query_params": { "visibility_level": ["20"] } } } ], "revision": 2 @@ -6998,13 +6568,12 @@ "intent": "Get me my RSS feed token", "intent_template": "Get me my RSS feed token", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["TMN_bBn9Z48qVbUFZV45"] } @@ 
-7017,21 +6586,18 @@ "task_id": 260, "intent_template_id": 211, "start_urls": ["__SHOPPING__"], - "intent": "I want to browse the products in the Video Game category", - "intent_template": "I want to browse the products in the {{category}} category", + "intent": "Open the Video Game category page to browse products", + "intent_template": "Open the {{category}} category page to browse products", "instantiation_dict": {"category": "Video Game"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/video-games.html", "response_status": 200} + "expected": {"url": "__SHOPPING__/video-games.html"} } ], "revision": 2 @@ -7041,21 +6607,18 @@ "task_id": 261, "intent_template_id": 211, "start_urls": ["__SHOPPING__"], - "intent": "I want to browse the products in the Headphones category", - "intent_template": "I want to browse the products in the {{category}} category", + "intent": "Open the Headphones category page to browse products", + "intent_template": "Open the {{category}} category page to browse products", "instantiation_dict": {"category": "Headphones"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/electronics/headphones.html", "response_status": 200} + "expected": {"url": "__SHOPPING__/electronics/headphones.html"} } ], 
"revision": 2 @@ -7065,24 +6628,18 @@ "task_id": 262, "intent_template_id": 211, "start_urls": ["__SHOPPING__"], - "intent": "I want to browse the products in the Men shoes category", - "intent_template": "I want to browse the products in the {{category}} category", + "intent": "Open the Men shoes category page to browse products", + "intent_template": "Open the {{category}} category page to browse products", "instantiation_dict": {"category": "Men shoes"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", - "response_status": 200 - } + "expected": {"url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html"} } ], "revision": 2 @@ -7092,24 +6649,18 @@ "task_id": 263, "intent_template_id": 211, "start_urls": ["__SHOPPING__"], - "intent": "I want to browse the products in the Woman clothing category", - "intent_template": "I want to browse the products in the {{category}} category", + "intent": "Open the Woman clothing category page to browse products", + "intent_template": "Open the {{category}} category page to browse products", "instantiation_dict": {"category": "Woman clothing"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html", - 
"response_status": 200 - } + "expected": {"url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html"} } ], "revision": 2 @@ -7119,23 +6670,19 @@ "task_id": 264, "intent_template_id": 211, "start_urls": ["__SHOPPING__"], - "intent": "I want to browse the products in the Cabinets, Racks & Shelves category", - "intent_template": "I want to browse the products in the {{category}} category", + "intent": "Open the Cabinets, Racks & Shelves category page to browse products", + "intent_template": "Open the {{category}} category page to browse products", "instantiation_dict": {"category": "Cabinets, Racks & Shelves"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html", - "response_status": 200 + "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html" } } ], @@ -7146,142 +6693,183 @@ "task_id": 265, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to Boston (use the provided wiki site to look up any needed information)? How far is it to drive there?", - "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? How far is it to drive there?", - "instantiation_dict": {"city": "Boston"}, - "format_specification": "Provide the name of the park using \"park_name\" and its distance using \"distance\".", + "intent": "Get the relation ID of the closest national park to Boston and the distance to drive there. 
Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "instantiation_dict": { + "city": "Boston", + "travel_mode": "drive", + "metric_phrase": "distance", + "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "park_name": { "type": "string" }, - "distance" : { "type": "string", "format": "distance" } - }, - "required": ["distance", "park_name"] + "relation_id": { "type": "integer" }, + "distance" : { "type": "string" , "format": "distance" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"park_name": "Acadia National Park", "distance": "457km"} ] + "retrieved_data": [ {"relation_id": 2176999, "distance": "459km"} ] } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-68.2177005,44.3494709;-71.0579762,42.3603713.*$"} } ], - "revision": 2 + "revision": 4 }, { "sites": ["wikipedia", "map"], "task_id": 266, "intent_template_id": 85, "start_urls": ["__MAP__", 
"__WIKIPEDIA__"], - "intent": "What's the closest national park to the largest city in Maine (use the provided wiki site to look up any needed information)?", - "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)?", - "instantiation_dict": {"city": "the largest city in Maine"}, - "format_specification": null, + "intent": "Get the relation ID of the closest national park to the largest city in Maine and the distance to drive there. Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "instantiation_dict": { + "city": "the largest city in Maine", + "travel_mode": "drive", + "metric_phrase": "distance", + "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string"} }, + "results_schema": { + "type": "array", + "items": { + "type": "object", + "properties": { + "relation_id": { "type": "integer" }, + "distance" : { "type": "string" , "format": "distance" } + } + } + }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Acadia National Park"] + "retrieved_data": [ {"relation_id": 2176999, "distance": "290km"} ] } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-68.2177005,44.3494709;-70.2545299,43.6599147.*$"} } ], - "revision": 2 + "revision": 4 }, { "sites": ["wikipedia", "map"], "task_id": 267, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to the hometown of Stephen King (use the provided wiki site to look up any needed information)? How long it takes to drive there?", - "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? 
How long it takes to drive there?", - "instantiation_dict": {"city": "the hometown of Stephen King"}, - "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", + "intent": "Get the relation ID of the closest national park to the hometown of Stephen King and the time to drive there. Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "instantiation_dict": { + "city": "the hometown of Stephen King", + "travel_mode": "drive", + "metric_phrase": "time", + "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "park_name": { "type": "string" }, - "time" : { "type": "string", "format": "duration" } - }, - "required": ["park_name", "time"] + "relation_id": { "type": "integer" }, + "duration" : { "type": "string" , "format": "duration" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"park_name": "Acadia National Park", "time": "1h 23min"} ] + "retrieved_data": [ {"relation_id": 2176999, "duration": "01:33:00"} ] + } + }, + { + "evaluator": 
"NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": { + "url": "^.*/route/.*/-68.2177005,44.3494709;-68.767507,44.8030715.*$", + "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} } } ], - "revision": 2 + "revision": 4 }, { "sites": ["wikipedia", "map"], "task_id": 268, "intent_template_id": 85, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "What's the closest national park to Vinalhaven, ME (use the provided wiki site to look up any needed information)? How long does it take to bike there?", - "intent_template": "What's the closest national park to {{city}} (use the provided wiki site to look up any needed information)? How long does it take to bike there?", - "instantiation_dict": {"city": "Vinalhaven, ME"}, - "format_specification": "Provide the name of the park using \"park_name\" and the time using \"time\".", + "intent": "Get the relation ID of the closest national park to Vinalhaven, ME and the time to bike there. Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "instantiation_dict": { + "city": "Vinalhaven, ME", + "travel_mode": "bike", + "metric_phrase": "time", + "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "park_name": { "type": "string" }, - "time" : { "type": "string", "format": "duration" } - }, - "required": ["park_name", "time"] + "relation_id": { "type": "integer" }, + "duration" : { "type": "string" , "format": "duration" } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ {"park_name": "Acadia National Park", "time": "10h 33min"} ] + "retrieved_data": [ {"relation_id": 2176999, "duration": "10:58:00"} ] + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": { + "url": "^.*/route/v1/.*/-68.2177005,44.3494709;-68.8315387,44.0478975.*$", + "headers": {"Cookie": "^.*_osm_directions_engine=fossgis_osrm_bicycle.*$"} } } ], - "revision": 2 + "revision": 4 }, { "sites": ["shopping"], "task_id": 269, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me all products in \"women shoes\" category filtered to under $25", - "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "intent": "Open the \"women shoes\" category page filtered to under $25", + "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", "instantiation_dict": {"price_range": "under $25", "product_category": "women shoes"}, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["^(?!price$).+$"], "expected": { "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html", @@ -7296,23 +6884,19 @@ "task_id": 270, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me all products in \"men shoes\" category filtered to under $30", - "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "intent": "Open the \"men shoes\" category page filtered to under $30", + "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", "instantiation_dict": {"price_range": "under $30", "product_category": "men shoes"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", - "response_status": 200, "query_params": { "price": ["0-30"] } } } @@ -7324,23 +6908,19 @@ "task_id": 271, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me all products in \"makeup remover\" category filtered to under $46.99", - "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "intent": "Open the \"makeup remover\" category page filtered to under $46.99", + 
"intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", "instantiation_dict": {"price_range": "under $46.99", "product_category": "makeup remover"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html", - "response_status": 200, "query_params": { "price": ["0-46.99"] } } } @@ -7352,23 +6932,19 @@ "task_id": 272, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me all products in \"children dental care\" category filtered to under $78", - "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "intent": "Open the \"children dental care\" category page filtered to under $78", + "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", "instantiation_dict": {"price_range": "under $78", "product_category": "children dental care"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html", - "response_status": 200, "query_params": { "price": ["0-78"] } } } @@ -7380,23 +6956,19 @@ "task_id": 273, "intent_template_id": 139, "start_urls": ["__SHOPPING__"], - "intent": "Show me 
all products in \"furniture with accent\" category filtered to under $199", - "intent_template": "Show me all products in \"{{product_category}}\" category filtered to {{price_range}}", + "intent": "Open the \"furniture with accent\" category page filtered to under $199", + "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", "instantiation_dict": {"price_range": "under $199", "product_category": "furniture with accent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html", - "response_status": 200, "query_params": { "price": ["0-199"] } } } @@ -7408,23 +6980,19 @@ "task_id": 274, "intent_template_id": 212, "start_urls": ["__SHOPPING__"], - "intent": "Search for \"usb wifi\"", - "intent_template": "Search for \"{{keyword}}\"", + "intent": "Open the search results for \"usb wifi\"", + "intent_template": "Open the search results for \"{{keyword}}\"", "instantiation_dict": {"keyword": "usb wifi"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["usb wifi"] } } } @@ -7436,23 +7004,19 @@ "task_id": 275, "intent_template_id": 212, "start_urls": ["__SHOPPING__"], - 
"intent": "Search for \"xbox\"", - "intent_template": "Search for \"{{keyword}}\"", + "intent": "Open the search results for \"xbox\"", + "intent_template": "Open the search results for \"{{keyword}}\"", "instantiation_dict": {"keyword": "xbox"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["xbox"] } } } @@ -7464,23 +7028,19 @@ "task_id": 276, "intent_template_id": 212, "start_urls": ["__SHOPPING__"], - "intent": "Search for \"switch accessories\"", - "intent_template": "Search for \"{{keyword}}\"", + "intent": "Open the search results for \"switch accessories\"", + "intent_template": "Open the search results for \"{{keyword}}\"", "instantiation_dict": {"keyword": "switch accessories"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["switch accessories"] } } } @@ -7492,23 +7052,19 @@ "task_id": 277, "intent_template_id": 212, "start_urls": ["__SHOPPING__"], - "intent": "Search for \"batteries for iphone 13\"", - "intent_template": "Search for \"{{keyword}}\"", + "intent": "Open the search results for \"batteries for iphone 13\"", + "intent_template": "Open the search results 
for \"{{keyword}}\"", "instantiation_dict": {"keyword": "batteries for iphone 13"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["batteries for iphone 13"] } } } @@ -7520,23 +7076,19 @@ "task_id": 278, "intent_template_id": 212, "start_urls": ["__SHOPPING__"], - "intent": "Search for \"green tea bag for weight loss\"", - "intent_template": "Search for \"{{keyword}}\"", + "intent": "Open the search results for \"green tea bag for weight loss\"", + "intent_template": "Open the search results for \"{{keyword}}\"", "instantiation_dict": {"keyword": "green tea bag for weight loss"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["green tea bag for weight loss"] } } } @@ -7548,14 +7100,15 @@ "task_id": 279, "intent_template_id": 204, "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of Bluetooth headphones from Sony, and also share the price range for the available models", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "instantiation_dict": {"product": "Bluetooth headphones 
from Sony"}, - "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "intent": "Provide me with the full names of Bluetooth headphones from Sony, and also share the price range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Bluetooth headphones from Sony", + "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -7564,12 +7117,11 @@ "names": { "type": "array", "items": {"type": "string"} }, "min": {"type": "number", "format": "currency"}, "max": {"type": "number", "format": "currency"} - }, - "required": ["max", "min", "names"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -7601,14 +7153,15 @@ "task_id": 280, "intent_template_id": 204, "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "instantiation_dict": {"product": "chargers from Anker"}, - "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available 
models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "chargers from Anker", + "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -7617,12 +7170,11 @@ "names": { "type": "array", "items": {"type": "string"} }, "min": {"type": "number", "format": "currency"}, "max": {"type": "number", "format": "currency"} - }, - "required": ["max", "min", "names"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -7640,8 +7192,8 @@ "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)" ], - "min": "8.99", - "max": "59.99" + "min": 8.99, + "max": 59.99 } ] } @@ -7654,14 +7206,15 @@ "task_id": 281, "intent_template_id": 204, "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "instantiation_dict": {"product": "Oral B brush heads designed for children"}, - 
"format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "Oral B brush heads designed for children", + "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -7670,12 +7223,11 @@ "names": { "type": "array", "items": {"type": "string"} }, "min": {"type": "number", "format": "currency"}, "max": {"type": "number", "format": "currency"} - }, - "required": ["max", "min", "names"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -7697,14 +7249,15 @@ "task_id": 282, "intent_template_id": 204, "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of slide slippers from Nike, and also share the price range for the available models", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models", - "instantiation_dict": {"product": "slide slippers from Nike"}, - "format_specification": "Use \"names\" for the list of product names and \"min\" for the minimum price and \"max\" for the maximum price.", + "intent": "Provide me with the full names of slide slippers from Nike, and also share the price 
range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", + "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "product": "slide slippers from Nike", + "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -7713,12 +7266,11 @@ "names": { "type": "array", "items": {"type": "string"} }, "min": {"type": "number", "format": "currency"}, "max": {"type": "number", "format": "currency"} - }, - "required": ["max", "min", "names"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -7747,23 +7299,19 @@ "task_id": 283, "intent_template_id": 210, "start_urls": ["__SHOPPING__"], - "intent": "Show me the most recent models of XBox controllers released between 2020-2021.", - "intent_template": "Show me the most recent models of XBox controllers released between 2020-2021.", + "intent": "Open the page showing the most recent Xbox controller models released between 2020-2021", + "intent_template": "Open the page showing the most recent Xbox controller models released between 2020-2021", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, 
"expected": { - "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html", - "response_status": 200 + "url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html" } } ], @@ -7774,23 +7322,19 @@ "task_id": 284, "intent_template_id": 207, "start_urls": ["__SHOPPING__"], - "intent": "Show the least expensive shoe storage with a minimum storage capacity of 12 pairs.", - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "intent": "View the product page for the least expensive shoe storage with a minimum storage capacity of 12 pairs.", + "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "shoe storage", "min_storage": "12 pairs"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html", - "response_status": 200 + "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html" } } ], @@ -7801,23 +7345,19 @@ "task_id": 285, "intent_template_id": 207, "start_urls": ["__SHOPPING__"], - "intent": "Show the least 
expensive switch card holder with a minimum storage capacity of 15 cards.", - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "intent": "View the product page for the least expensive switch card holder with a minimum storage capacity of 15 cards.", + "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "switch card holder", "min_storage": "15 cards"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html", - "response_status": 200 + "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" } } ], @@ -7828,23 +7368,19 @@ "task_id": 286, "intent_template_id": 207, "start_urls": ["__SHOPPING__"], - "intent": "Show the least expensive ssd hard drive with a minimum storage capacity of 1TB.", - "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", + "intent": "View the product page for the least expensive ssd hard drive with a minimum storage capacity of 1TB.", + "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", "instantiation_dict": {"product": "ssd hard drive", "min_storage": "1TB"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": 
{"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { - "url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html", - "response_status": 200 + "url": "__SHOPPING__/portable-ssd-2tb-external-hard-drive-mobile-solid-state-drive-portable-hard-drive-for-pc-laptop-and-mac-data-storage-and-transfer-2tb-silver.html" } } ], @@ -7855,20 +7391,16 @@ "task_id": 287, "intent_template_id": 47, "start_urls": ["__MAP__"], - "intent": "How much time does it take from Pittsburgh to Philadelphia by car?", - "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?", - "instantiation_dict": {}, - "format_specification": null, + "intent": "How much time does it take from Pittsburgh to Philadelphia by car? Return the value as a string in HH:MM:SS format only, without any additional details.", + "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car? 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["5h 47min"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["5h 47min"] } } ], "revision": 2 @@ -7878,19 +7410,17 @@ "task_id": 288, "intent_template_id": 234, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the name of the customer who has the most cancellations in the history", - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": {"attribute": "name"}, - "format_specification": null, + "intent": "Get the email of the customer who has the most cancellations in the history", + "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", + "instantiation_dict": {"attribute": "email", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Samantha Jones"] + "retrieved_data": ["coolcat321@hotmail.com"] } } ], @@ -7901,14 +7431,15 @@ "task_id": 289, "intent_template_id": 234, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history", - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": {"attribute": "email address, name, 
phone number"}, - "format_specification": "Use \"name\" for the customer name and \"email\" for the email and \"phone_number\" for the phone number.", + "intent": "Get the email address, name, phone number of the customer who has the most cancellations in the history. Return a list of objects with keys \"name\", \"email\", and \"phone_number\".", + "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "email address, name, phone number", + "retrieved_data_format_spec": ". Return a list of objects with keys \"name\", \"email\", and \"phone_number\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -7917,12 +7448,11 @@ "name" : { "type": "string" }, "email" : { "type": "string" }, "phone_number": { "type": "string" } - }, - "required": ["email", "name", "phone_number"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -7941,17 +7471,18 @@ "task_id": 290, "intent_template_id": 234, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": {"attribute": "product SKUs in the most recent cancelled orders"}, - "format_specification": null, + "intent": "Get the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", + "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", + "instantiation_dict": { + "attribute": "product SKUs in the most recent cancelled orders", + "retrieved_data_format_spec": "" + }, 
"eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["WSH09-29-White", "WSH09-28-Green", "MSH11-34-Blue", "WP09-29-Purple"] } @@ -7964,22 +7495,17 @@ "task_id": 291, "intent_template_id": 234, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history", - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", + "intent": "Get the total spend on products in the most recent cancelled orders excluding shipping and handling of the customer who has the most cancellations in the historyReturn the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", "instantiation_dict": { - "attribute": "total spend on products in the most recent cancelled orders excluding shipping and handling" + "attribute": "total spend on products in the most recent cancelled orders excluding shipping and handling", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [148.8] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [148.8] } } ], "revision": 2 @@ -7989,20 +7515,14 @@ "task_id": 292, "intent_template_id": 234, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Tell me the total 
number of cancellations of the customer who has the most cancellations in the history", - "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history", - "instantiation_dict": {"attribute": "total number of cancellations"}, - "format_specification": null, + "intent": "Get the total number of cancellations of the customer who has the most cancellations in the history", + "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", + "instantiation_dict": {"attribute": "total number of cancellations", "retrieved_data_format_spec": ""}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [9] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [9] } } ], "revision": 2 @@ -8012,18 +7532,20 @@ "task_id": 293, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone Super_Awesome_Robot with SSH", - "intent_template": "Get the URL to clone {{repo}} with SSH", - "instantiation_dict": {"repo": "Super_Awesome_Robot"}, - "format_specification": null, + "intent": "Get the URL to clone Super_Awesome_Robot with SSH. Return the URL only, without any additional details.", + "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "repo": "Super_Awesome_Robot", + "retrieved_data_format_spec": "Return the URL only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["ssh://git@__GITLAB__:2222/convexegg/super_awesome_robot.git"] + "retrieved_data": ["git@__SSH_HOST__:convexegg/super_awesome_robot.git"] } } ], @@ -8034,18 +7556,20 @@ "task_id": 294, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone ChatGPT with SSH", - "intent_template": "Get the URL to clone {{repo}} with SSH", - "instantiation_dict": {"repo": "ChatGPT"}, - "format_specification": null, + "intent": "Get the URL to clone ChatGPT with SSH. Return the URL only, without any additional details.", + "intent_template": "Get the URL to clone {{repo}} with SSH. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "repo": "ChatGPT", + "retrieved_data_format_spec": "Return the URL only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["ssh://git@__GITLAB__:2222/convexegg/chatgpt.git"] + "retrieved_data": ["git@__SSH_HOST__:convexegg/chatgpt.git"] } } ], @@ -8056,18 +7580,20 @@ "task_id": 295, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone metaseq with SSH", - "intent_template": "Get the URL to clone {{repo}} with SSH", - "instantiation_dict": {"repo": "metaseq"}, - "format_specification": null, + "intent": "Get the URL to clone metaseq with SSH. 
Return the URL only, without any additional details.", + "intent_template": "Get the URL to clone {{repo}} with SSH. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "repo": "metaseq", + "retrieved_data_format_spec": "Return the URL only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["ssh://git@__GITLAB__:2222/root/metaseq.git"] + "retrieved_data": ["git@__SSH_HOST__:root/metaseq.git"] } } ], @@ -8078,18 +7604,20 @@ "task_id": 296, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone the best GAN python implementation with SSH", - "intent_template": "Get the URL to clone {{repo}} with SSH", - "instantiation_dict": {"repo": "the best GAN python implementation"}, - "format_specification": null, + "intent": "Get the URL to clone the best GAN python implementation with SSH. Return the URL only, without any additional details.", + "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "repo": "the best GAN python implementation", + "retrieved_data_format_spec": "Return the URL only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["ssh://git@__GITLAB__:2222/eriklindernoren/PyTorch-GAN.git"] + "retrieved_data": ["git@__SSH_HOST__:eriklindernoren/PyTorch-GAN.git"] } } ], @@ -8100,18 +7628,20 @@ "task_id": 297, "intent_template_id": 329, "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone the most stared Covid related project with SSH", - "intent_template": "Get the URL to clone {{repo}} with SSH", - "instantiation_dict": {"repo": "the most stared Covid related project"}, - "format_specification": null, + "intent": "Get the URL to clone the most stared Covid related project with SSH. Return the URL only, without any additional details.", + "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "repo": "the most stared Covid related project", + "retrieved_data_format_spec": "Return the URL only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["ssh://git@__GITLAB__:2222/covid19india/covid19india-react.git"] + "retrieved_data": ["git@__SSH_HOST__:covid19india/covid19india-react.git"] } } ], @@ -8122,21 +7652,18 @@ "task_id": 298, "intent_template_id": 180, "start_urls": ["__SHOPPING__"], - "intent": "Show the most recent completed order", - "intent_template": "Show the most recent {{status}} order", + "intent": "Open the order details page for the most recent completed order", + "intent_template": "Open the order details page for the most recent {{status}} order", "instantiation_dict": {"status": "completed"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/sales/order/view/order_id/180/", "response_status": 200} + "expected": {"url": "__SHOPPING__/sales/order/view/order_id/180/"} } ], "revision": 2 @@ -8146,21 +7673,18 @@ "task_id": 299, "intent_template_id": 180, "start_urls": ["__SHOPPING__"], - "intent": "Show the most recent cancelled order", - "intent_template": "Show the most recent {{status}} order", + "intent": "Open the order details page for the most recent cancelled order", + "intent_template": "Open the order details page for the most recent {{status}} order", 
"instantiation_dict": {"status": "cancelled"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/sales/order/view/order_id/170/", "response_status": 200} + "expected": {"url": "__SHOPPING__/sales/order/view/order_id/170/"} } ], "revision": 2 @@ -8170,24 +7694,18 @@ "task_id": 300, "intent_template_id": 180, "start_urls": ["__SHOPPING__"], - "intent": "Show the most recent pending order", - "intent_template": "Show the most recent {{status}} order", + "intent": "Open the order details page for the most recent pending order", + "intent_template": "Open the order details page for the most recent {{status}} order", "instantiation_dict": {"status": "pending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": { - "url": "__SHOPPING__/sales/order/view/order_id/189/", - "headers": {"referer": "__SHOPPING__/sales/order/history/"} - } + "expected": {"url": "__SHOPPING__/sales/order/view/order_id/189/"} } ], "revision": 2 @@ -8197,23 +7715,17 @@ "task_id": 301, "intent_template_id": 180, "start_urls": ["__SHOPPING__"], - "intent": "Show the most recent processing order", - "intent_template": "Show the most recent {{status}} order", + "intent": "Open the order details page for the most recent processing order", + "intent_template": "Open the order details 
page for the most recent {{status}} order", "instantiation_dict": {"status": "processing"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "navigate", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -8224,23 +7736,17 @@ "task_id": 302, "intent_template_id": 180, "start_urls": ["__SHOPPING__"], - "intent": "Show the most recent out of delivery order", - "intent_template": "Show the most recent {{status}} order", + "intent": "Open the order details page for the most recent out of delivery order", + "intent_template": "Open the order details page for the most recent {{status}} order", "instantiation_dict": {"status": "out of delivery"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "navigate", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -8254,17 +7760,11 @@ "intent": "How many commits did Kilian make during 2023 in the current repository?", "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", "instantiation_dict": {"user": "Kilian", "period": "during 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [1] - } + 
"expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } } ], "revision": 2 @@ -8280,17 +7780,11 @@ "user": "Eric Bailey", "period": "between start of Feb 2023 and end of May 2023" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [14] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [14] } } ], "revision": 2 @@ -8303,17 +7797,11 @@ "intent": "How many commits did Philip make in Jan 2023 in the current repository?", "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", "instantiation_dict": {"user": "Philip", "period": "in Jan 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -8326,17 +7814,11 @@ "intent": "How many commits did Anthony make between August 2022 through September 2022 in the current repository?", "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", "instantiation_dict": {"user": "Anthony", "period": "between August 2022 through September 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -8349,17 +7831,11 @@ "intent": "How 
many commits did Nic Chan make on April 7th 2022 in the current repository?", "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", "instantiation_dict": {"user": "Nic Chan", "period": "on April 7th 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [5] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } } ], "revision": 2 @@ -8372,13 +7848,12 @@ "intent": "Get the username(s) of the user(s) with the most commits to the primer/design project", "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "primer/design"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["shawn.allen@github.com"] } @@ -8394,13 +7869,12 @@ "intent": "Get the username(s) of the user(s) with the most commits to the thoughtbot/administrate project", "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "thoughtbot/administrate"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["wright.grayson@gmail.com"] } @@ -8416,13 +7890,12 @@ "intent": "Get the username(s) of the user(s) with the most commits to the AndroidSlidingUpPanel project", "intent_template": "Get the username(s) of the user(s) with the most 
commits to the {{repo}} project", "instantiation_dict": {"repo": "AndroidSlidingUpPanel"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["antonlopyrev@gmail.com"] } @@ -8438,13 +7911,12 @@ "intent": "Get the username(s) of the user(s) with the most commits to the Pytorch GAN project", "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "Pytorch GAN"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["eriklindernoren@live.se"] } @@ -8460,13 +7932,12 @@ "intent": "Get the username(s) of the user(s) with the most commits to the csvkit project", "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", "instantiation_dict": {"repo": "csvkit"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["staringmonkey@gmail.com"] } @@ -8482,16 +7953,11 @@ "intent": "Get the customer service phone number", "intent_template": "Get the customer service phone number", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": 
null} } ], "revision": 2 @@ -8502,15 +7968,18 @@ "intent_template_id": 324, "start_urls": ["__GITLAB__"], "intent": "Get the full names of the top 3 contributors (by commit count) to primer/design repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", - "instantiation_dict": {"repo": "primer/design", "attribute": "full names"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", + "instantiation_dict": { + "repo": "primer/design", + "attribute": "full names", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Shawn Allen", "Inayaili Le\u00f3n", "Aurora Pleguezuelo"] } @@ -8524,15 +7993,18 @@ "intent_template_id": 324, "start_urls": ["__GITLAB__"], "intent": "Get the email addresses of the top 3 contributors (by commit count) to Pytorch GAN repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", - "instantiation_dict": {"repo": "Pytorch GAN", "attribute": "email addresses"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", + "instantiation_dict": { + "repo": "Pytorch GAN", + "attribute": "email addresses", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["eriklindernoren@live.se", "eriklindernoren@gmail.com", "pinnacle.chen@qq.com"] } @@ -8546,22 +8018,21 @@ 
"intent_template_id": 324, "start_urls": ["__GITLAB__"], "intent": "Get the email addresses of the top 3 contributors (by commit count) to facebook\"s guide on building react apps repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", "instantiation_dict": { - "repo": "facebook's guide on building react apps", - "attribute": "email addresses" + "repo": "facebook\"s guide on building react apps", + "attribute": "email addresses", + "retrieved_data_format_spec": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["dan.abramov@gmail.com", "timer150@gmail.com", "ian@iansutherland.ca"] - }, - "ordered": false + } } ], "revision": 2 @@ -8571,14 +8042,16 @@ "task_id": 317, "intent_template_id": 324, "start_urls": ["__GITLAB__"], - "intent": "Get the names and number of commits of the top 3 contributors (by commit count) to metaseq repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", - "instantiation_dict": {"repo": "metaseq", "attribute": "names and number of commits"}, - "format_specification": "Use objects with keys: \"first_name\", \"last_name\" and \"number_of_commits\".", + "intent": "Get the names and number of commits of the top 3 contributors (by commit count) to metaseq repo. 
Return a list of objects with keys \"first_name\", \"last_name\", and \"number_of_commits\".", + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", + "instantiation_dict": { + "repo": "metaseq", + "attribute": "names and number of commits", + "retrieved_data_format_spec": ". Return a list of objects with keys \"first_name\", \"last_name\", and \"number_of_commits\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -8587,12 +8060,11 @@ "first_name" : { "type": "string" }, "last_name" : { "type": "string" }, "number_of_commits": { "type": "number" } - }, - "required": ["first_name", "last_name", "number_of_commits"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "first_name": "Susan" , "last_name": "Zhang" , "number_of_commits": 70 }, @@ -8610,16 +8082,18 @@ "intent_template_id": 324, "start_urls": ["__GITLAB__"], "intent": "Get the last names of the top 3 contributors (by commit count) to 2019-nCov repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo", - "instantiation_dict": {"repo": "2019-nCov", "attribute": "last names"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", + "instantiation_dict": { + "repo": "2019-nCov", + "attribute": "last names", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Lo", "Chen", "Chu"] } @@ -8632,16 +8106,18 @@ "task_id": 319, "intent_template_id": 160, "start_urls": 
["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in April 2022, including shipping fee", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", - "instantiation_dict": {"time": "April 2022", "conditions": ", including shipping fee"}, - "format_specification": null, + "intent": "How much refund should I expect from my orders canceled, if any, in April 2022, including shipping fee.. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "time": "April 2022", + "conditions": ", including shipping fee.", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": {"performed_operation": "retrieve", "status": "NOT_FOUND_ERROR"} + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR"} } ], "revision": 2 @@ -8651,20 +8127,18 @@ "task_id": 320, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in Feb 2023, including shipping fee", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", - "instantiation_dict": {"time": "Feb 2023", "conditions": ", including shipping fee"}, - "format_specification": null, + "intent": "How much refund should I expect from my orders canceled, if any, in Feb 2023, including shipping fee.. 
Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "time": "Feb 2023", + "conditions": ", including shipping fee.", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [406.53] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [406.53] } } ], "revision": 2 @@ -8674,20 +8148,18 @@ "task_id": 321, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in 2022, including shipping fee", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", - "instantiation_dict": {"time": "2022", "conditions": ", including shipping fee"}, - "format_specification": null, + "intent": "How much refund should I expect from my orders canceled, if any, in 2022, including shipping fee.. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "time": "2022", + "conditions": ", including shipping fee.", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [3053.97] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [3053.97] } } ], "revision": 2 @@ -8697,23 +8169,18 @@ "task_id": 322, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in May 2023 if I cannot get the shipping fee refunded?", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "intent": "How much refund should I expect from my orders canceled, if any, in May 2023 if I cannot get the shipping fee refunded?. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", "instantiation_dict": { "time": "May 2023", - "conditions": " if I cannot get the shipping fee refunded?" 
+ "conditions": " if I cannot get the shipping fee refunded?", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [350.42] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [350.42] } } ], "revision": 2 @@ -8723,23 +8190,18 @@ "task_id": 323, "intent_template_id": 160, "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in March 2022? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}", + "intent": "How much refund should I expect from my orders canceled, if any, in March 2022? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back.. Return the value as a number (e.g., 10.99) only, without any additional details.", + "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", "instantiation_dict": { "time": "March 2022", - "conditions": "? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back" + "conditions": "? 
I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back.", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [264.49] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [264.49] } } ], "revision": 2 @@ -8749,23 +8211,19 @@ "task_id": 324, "intent_template_id": 208, "start_urls": ["__SHOPPING__"], - "intent": "Show me the \"chairs\" listings by ascending price.", - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "intent": "Pull up the page with all \"chairs\" listings sorted by ascending price.", + "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", "instantiation_dict": {"product": "chairs", "sorting_order": "ascending price"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/index/", - "response_status": 200, "query_params": { "product_list_order": [ "price" ], "q" : [ "chairs" ], @@ -8781,20 +8239,17 @@ "task_id": 325, "intent_template_id": 208, "start_urls": ["__SHOPPING__"], - "intent": "Show me the \"mouth night guard\" listings by descending price.", - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "intent": "Pull up the page with all 
\"mouth night guard\" listings sorted by descending price.", + "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", "instantiation_dict": {"product": "mouth night guard", "sorting_order": "descending price"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order).+$"], "expected": { "url": "__SHOPPING__/catalogsearch/result/index/", @@ -8809,26 +8264,22 @@ "task_id": 326, "intent_template_id": 208, "start_urls": ["__SHOPPING__"], - "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.", - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "intent": "Pull up the page with all \"Canon photo printer\" listings sorted by search relevance, from most to least.", + "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", "instantiation_dict": { "product": "Canon photo printer", "sorting_order": "search relevance, from most to least" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/catalogsearch/result/", - "response_status": 200, "query_params": { "q": ["Canon photo printer"] } } } @@ -8840,20 +8291,17 @@ "task_id": 327, 
"intent_template_id": 208, "start_urls": ["__SHOPPING__"], - "intent": "Show me the \"iphone 12 phone case\" listings by name alphabetically.", - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "intent": "Pull up the page with all \"iphone 12 phone case\" listings sorted by name alphabetically.", + "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "name alphabetically"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order$).+$"], "expected": { "url": "__SHOPPING__/catalogsearch/result/index/", @@ -8872,20 +8320,17 @@ "task_id": 328, "intent_template_id": 208, "start_urls": ["__SHOPPING__"], - "intent": "Show me the \"iphone 12 phone case\" listings by price.", - "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.", + "intent": "Pull up the page with all \"iphone 12 phone case\" listings sorted by price.", + "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "price"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": 
true, "ignored_query_params_patterns": ["^(?!q$|product_list_order$).+$"], "expected": { "url": "__SHOPPING__/catalogsearch/result/index/", @@ -8900,20 +8345,17 @@ "task_id": 329, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market on April 19, 2023, excluding shipping.", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", - "instantiation_dict": {"time": "on April 19, 2023"}, - "format_specification": null, + "intent": "Return the total amount I spent on shopping at One Stop Market on April 19, 2023, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. {{retrieved_data_format_spec}}", + "instantiation_dict": { + "time": "on April 19, 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -8923,20 +8365,17 @@ "task_id": 330, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in March 2023, excluding shipping.", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", - "instantiation_dict": {"time": "in March 2023"}, - "format_specification": null, + "intent": "Return the total amount I spent on shopping at One Stop Market in March 2023, excluding shipping. 
Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. {{retrieved_data_format_spec}}", + "instantiation_dict": { + "time": "in March 2023", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [53.31] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [53.31] } } ], "revision": 2 @@ -8946,20 +8385,17 @@ "task_id": 331, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in July 2022, excluding shipping.", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", - "instantiation_dict": {"time": "in July 2022"}, - "format_specification": null, + "intent": "Return the total amount I spent on shopping at One Stop Market in July 2022, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "time": "in July 2022", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [25.16] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [25.16] } } ], "revision": 2 @@ -8969,14 +8405,15 @@ "task_id": 332, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market each months from Jan to the March 31, 2023, excluding shipping.", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", - "instantiation_dict": {"time": "each months from Jan to the March 31, 2023"}, - "format_specification": "Use \"month\" for month and \"total\" for spent amount.", + "intent": "Return the total amount I spent on shopping at One Stop Market each month from Jan to the March 31, 2023, excluding shipping. Return a list of objects with keys \"month\" (month name) and \"total\" (as a number, e.g., 10.99) only, without any additional details", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "time": "each month from Jan to the March 31, 2023", + "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"total\" (as a number, e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -8984,12 +8421,11 @@ "properties": { "month": { "type": "string", "format": "month" }, "total": { "type": "number", "format": "currency" } - }, - "required": ["month", "total"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { "month": "Jan", "total": 542.88 }, @@ -9006,20 +8442,17 @@ "task_id": 333, "intent_template_id": 147, "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in November 2022, excluding shipping.", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping.", - "instantiation_dict": {"time": "in November 2022"}, - "format_specification": null, + "intent": "Return the total amount I spent on shopping at One Stop Market in November 2022, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", + "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. 
{{retrieved_data_format_spec}}", + "instantiation_dict": { + "time": "in November 2022", + "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [358.18] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [358.18] } } ], "revision": 2 @@ -9029,17 +8462,18 @@ "task_id": 334, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my muffin cornbread mix", - "intent_template": "Return the date I last ordered my {{description}}", - "instantiation_dict": {"description": "muffin cornbread mix"}, - "format_specification": null, + "intent": "Return the date I last ordered my muffin cornbread mix. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Return the date I last ordered my {{description}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "description": "muffin cornbread mix", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["03/11/2023"] } @@ -9052,17 +8486,18 @@ "task_id": 335, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my body butter", - "intent_template": "Return the date I last ordered my {{description}}", - "instantiation_dict": {"description": "body butter"}, - "format_specification": null, + "intent": "Return the date I last ordered my body butter. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Return the date I last ordered my {{description}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "description": "body butter", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["01/16/2023"] } @@ -9075,17 +8510,18 @@ "task_id": 336, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my conditioner", - "intent_template": "Return the date I last ordered my {{description}}", - "instantiation_dict": {"description": "conditioner"}, - "format_specification": null, + "intent": "Return the date I last ordered my conditioner. 
Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Return the date I last ordered my {{description}}. {{retrieved_data_format_spec}}.", + "instantiation_dict": { + "description": "conditioner", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["01/16/2023"] } @@ -9098,17 +8534,18 @@ "task_id": 337, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my olive bread", - "intent_template": "Return the date I last ordered my {{description}}", - "instantiation_dict": {"description": "olive bread"}, - "format_specification": null, + "intent": "Return the date I last ordered my olive bread. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Return the date I last ordered my {{description}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "description": "olive bread", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["12/12/2022"] } @@ -9121,17 +8558,18 @@ "task_id": 338, "intent_template_id": 169, "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my toothpaste", - "intent_template": "Return the date I last ordered my {{description}}", - "instantiation_dict": {"description": "toothpaste"}, - "format_specification": null, + "intent": "Return the date I last ordered my toothpaste. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", + "intent_template": "Return the date I last ordered my {{description}}. 
{{retrieved_data_format_spec}}.", + "instantiation_dict": { + "description": "toothpaste", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["12/04/2022"] } @@ -9144,24 +8582,29 @@ "task_id": 339, "intent_template_id": 299, "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Show me the list of all opened issues that report bugs for the current project", - "intent_template": "Show me the list of all opened issues {{description}} for the current project", + "intent": "Go to the list of all opened issues that report bugs for the current project", + "intent_template": "Go to the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that report bugs"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "expected": {"url": "^__GITLAB__/a11yproject/a11yproject.com/-/issues/.*$"}, + "ignored_query_params_patterns": [".*"] + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["page", "sort", "scope"], "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/", - "response_status": 200, - "query_params": { "label_name[]": ["bug"], "state": ["opened"] } + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": 
"__GITLAB__/a11yproject/a11yproject.com/-/issues/?state=opened&label_name%5B%5D=bug" + } } } ], @@ -9172,24 +8615,29 @@ "task_id": 340, "intent_template_id": 299, "start_urls": ["__GITLAB__/primer/design"], - "intent": "Show me the list of all opened issues that report bugs for the current project", - "intent_template": "Show me the list of all opened issues {{description}} for the current project", + "intent": "Go to the list of all opened issues that report bugs for the current project", + "intent_template": "Go to the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that report bugs"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "expected": {"url": "^__GITLAB__/primer/design/-/issues/.*$"}, + "ignored_query_params_patterns": [".*"] + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["page", "sort", "scope"], "expected": { - "url": "__GITLAB__/primer/design/-/issues/", - "response_status": 200, - "query_params": { "label_name[]": ["type: bug \ud83d\udc1e"], "state": ["opened"] } + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/primer/design/-/issues/?state=opened&label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E" + } } } ], @@ -9200,24 +8648,29 @@ "task_id": 341, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Show me the list of all opened issues requesting new features for the current project", - "intent_template": "Show me the list of all opened issues {{description}} for the current project", + "intent": "Go to the list of all opened issues 
requesting new features for the current project", + "intent_template": "Go to the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "requesting new features"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "expected": {"url": "^__GITLAB__/root/metaseq/-/issues/.*$"}, + "ignored_query_params_patterns": [".*"] + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["page", "sort", "scope"], "expected": { - "url": "__GITLAB__/root/metaseq/-/issues/", - "response_status": 200, - "query_params": { "label_name[]": ["enhancement"], "state": ["opened"] } + "url": "__GITLAB__/api/graphql", + "http_method": "POST", + "headers": { + "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=enhancement" + } } } ], @@ -9228,20 +8681,22 @@ "task_id": 342, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Show me the list of all opened issues that ask about OPT model related questions for the current project", - "intent_template": "Show me the list of all opened issues {{description}} for the current project", + "intent": "Go to the list of all opened issues that ask about OPT model related questions for the current project", + "intent_template": "Go to the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that ask about OPT model related questions"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", 
"retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/root/metaseq/-/issues.*$"} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "ignored_query_params_patterns": ["page", "sort"], "expected": { "url": "__GITLAB__/api/graphql", @@ -9259,21 +8714,23 @@ "task_id": 343, "intent_template_id": 299, "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Show me the list of all opened issues that don't have any labels for the current project", - "intent_template": "Show me the list of all opened issues {{description}} for the current project", + "intent": "Go to the list of all opened issues that don't have any labels for the current project", + "intent_template": "Go to the list of all opened issues {{description}} for the current project", "instantiation_dict": {"description": "that don't have any labels"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, - "ignored_query_params_patterns": ["page", "sort"], + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^__GITLAB__/root/metaseq/-/issues.*$"} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["page", "sort", "scope"], "expected": { "url": "__GITLAB__/api/graphql", "http_method": "POST", @@ -9293,17 +8750,11 @@ "intent": "How many reviews did our shop receive so far?", "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "so far"}, - "format_specification": null, 
"eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [351] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } } ], "revision": 2 @@ -9316,17 +8767,11 @@ "intent": "How many reviews did our shop receive in Apr 2023?", "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "in Apr 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [351] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } } ], "revision": 2 @@ -9339,17 +8784,11 @@ "intent": "How many reviews did our shop receive during 2022?", "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "during 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -9362,17 +8801,11 @@ "intent": "How many reviews did our shop receive from the beginning of the shop?", "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "from the beginning of the shop"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - 
"status": "SUCCESS", - "retrieved_data": [351] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } } ], "revision": 2 @@ -9385,17 +8818,11 @@ "intent": "How many reviews did our shop receive in May 2023?", "intent_template": "How many reviews did our shop receive {{time}}?", "instantiation_dict": {"time": "in May 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -9408,20 +8835,18 @@ "intent": "Get the usernames of other users who have access to my repo gimmiethat.space", "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", "instantiation_dict": {"repo": "gimmiethat.space"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ ["yjlou", "@yjlou"] ] + "retrieved_data": ["^@?yjlou$"] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab"], @@ -9431,43 +8856,37 @@ "intent": "Get the usernames of other users who have access to my repo prism-theme", "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", "instantiation_dict": {"repo": "prism-theme"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": [ ["abisubramanya27", "@abisubramanya27"] ] + 
"retrieved_data": ["^@?abisubramanya27$"] } } ], - "revision": 2 + "revision": 3 }, { "sites": ["shopping"], "task_id": 351, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from PS4 accessories category by ascending price", - "intent_template": "List products from {{product_category}} category by {{order}} price", + "intent": "Go to the page showing PS4 accessories products sorted by ascending price", + "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", "instantiation_dict": {"product_category": "PS4 accessories", "order": "ascending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/video-games/playstation-4/accessories.html", - "response_status": 200, "query_params": { "product_list_order": ["price"] } } } @@ -9479,23 +8898,19 @@ "task_id": 352, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from nutrition bars and drinks category by ascending price", - "intent_template": "List products from {{product_category}} category by {{order}} price", + "intent": "Go to the page showing nutrition bars and drinks products sorted by ascending price", + "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", "instantiation_dict": {"product_category": "nutrition bars and drinks", "order": "ascending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": 
{"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html", - "response_status": 200, "query_params": { "product_list_order": ["price"] } } } @@ -9507,23 +8922,19 @@ "task_id": 353, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from competitive swimwear category by ascending price", - "intent_template": "List products from {{product_category}} category by {{order}} price", + "intent": "Go to the page showing competitive swimwear products sorted by ascending price", + "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", "instantiation_dict": {"product_category": "competitive swimwear", "order": "ascending"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html", - "response_status": 200, "query_params": { "product_list_order": ["price"] } } } @@ -9535,23 +8946,19 @@ "task_id": 354, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from living room furtniture category by descending price", - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": {"product_category": "living room furtniture", "order": "descending"}, - "format_specification": null, + "intent": "Go to the page showing living room furniture products sorted by descending price", + 
"intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", + "instantiation_dict": {"product_category": "living room furniture", "order": "descending"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html", - "response_status": 200, "query_params": { "product_list_order": ["price"], "product_list_dir": ["desc"] } } } @@ -9563,23 +8970,19 @@ "task_id": 355, "intent_template_id": 137, "start_urls": ["__SHOPPING__"], - "intent": "List products from kids\" bedding category by descending price", - "intent_template": "List products from {{product_category}} category by {{order}} price", - "instantiation_dict": {"product_category": "kids' bedding", "order": "descending"}, - "format_specification": null, + "intent": "Go to the page showing kids\" bedding products sorted by descending price", + "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", + "instantiation_dict": {"product_category": "kids\" bedding", "order": "descending"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html", - "response_status": 200, "query_params": { "product_list_dir": ["desc"] } } } @@ -9591,22 +8994,16 @@ "task_id": 356, "intent_template_id": 49, 
"start_urls": ["__MAP__"], - "intent": "Show the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", - "intent_template": "Show the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed", + "intent": "Show on the map the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed. (Use the OSRM direction service.)", + "intent_template": "Show on the map the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed. (Use the OSRM direction service.)", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -9615,26 +9012,24 @@ "task_id": 357, "intent_template_id": 291, "start_urls": ["__GITLAB__"], - "intent": "Show me the merge requests requiring my review", - "intent_template": "Show me the merge requests requiring my review", + "intent": "Go to the merge requests requiring my review", + "intent_template": "Go to the merge requests requiring my review", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": 
"navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { - "url": [ - "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze", - "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze&scope=all&state=opened" - ], - "response_status": 200 + "url": "__GITLAB__/dashboard/merge_requests", + "query_params": { + "reviewer_username": [ "byteblaze" ], + "scope" : [ "^(all|)$" ], + "state" : [ "^(opened|)$" ] + } } } ], @@ -9646,18 +9041,20 @@ "intent_template_id": 206, "start_urls": ["__SHOPPING__"], "intent": "Get the shipping method for order number 187.", - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "instantiation_dict": {"info": "shipping method", "order_number": 187}, - "format_specification": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "info": "shipping method", + "order_number": 187, + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", - "retrieved_data": ["Flat Rate - Fixed"] + "retrieved_data": ["^flat rate[^a-z0-9]*(?:fixed)$"] } } ], @@ -9668,17 +9065,19 @@ "task_id": 359, "intent_template_id": 206, "start_urls": ["__SHOPPING__"], - "intent": "Get the order date for order number 148.", - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "instantiation_dict": {"info": "order date", "order_number": "148"}, - "format_specification": null, + "intent": "Get the order date for order number 148.Return the date in YYYY-MM-DD format or null if not available, without any additional details", + "intent_template": "Get the {{info}} for order number 
{{order_number}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "info": "order date", + "order_number": "148", + "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["January 29, 2023"] } @@ -9692,16 +9091,18 @@ "intent_template_id": 206, "start_urls": ["__SHOPPING__"], "intent": "Get the product names for order number 148.", - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "instantiation_dict": {"info": "product names", "order_number": "148"}, - "format_specification": null, + "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "info": "product names", + "order_number": "148", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", @@ -9717,28 +9118,29 @@ "task_id": 361, "intent_template_id": 206, "start_urls": ["__SHOPPING__"], - "intent": "Get the order statuses for order number 170 and 189.", - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "instantiation_dict": {"info": "order statuses", "order_number": "170 and 189"}, - "format_specification": "Use \"order_number\" for order number and \"status\" for status.", + "intent": "Get the order statuses for order number 170 and 189. 
Return a list of objects with keys \"order_number\" and \"status\".", + "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "info": "order statuses", + "order_number": "170 and 189", + "retrieved_data_format_spec": " Return a list of objects with keys \"order_number\" and \"status\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", - "properties": { "order_number": {"type": "string"}, "status": {"type": "string"} }, - "required": ["order_number", "status"] + "properties": { "order_number": {"type": "string"}, "status": {"type": "string"} } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ - { "order_number": "170", "status": "canceled" }, - { "order_number": "189", "status": "pending" } + { "order_number": "^#?\\s*0*170$", "status": "canceled" }, + { "order_number": "^#?\\s*0*189$", "status": "pending" } ] } } @@ -9750,14 +9152,16 @@ "task_id": 362, "intent_template_id": 206, "start_urls": ["__SHOPPING__"], - "intent": "Get the billing address for order number 00178.", - "intent_template": "Get the {{info}} for order number {{order_number}}.", - "instantiation_dict": {"info": "billing address", "order_number": "00178"}, - "format_specification": "Use keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\". Set any key to null if not available.", + "intent": "Get the billing address for order number 00178. 
Return a list of objects with keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\".", + "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", + "instantiation_dict": { + "info": "billing address", + "order_number": "00178", + "retrieved_data_format_spec": " Return a list of objects with keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\"." + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -9770,12 +9174,11 @@ "state" : { "type": "string" }, "postcode" : { "type": "string" }, "country" : { "type": "string" } - }, - "required": ["house_number", "street", "city", "country", "state", "postcode"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -9797,23 +9200,18 @@ "task_id": 363, "intent_template_id": 58, "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking", - "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "location/address_1": "Carnegie Mellon University", - "location/address_2": "Carnegie Music Hall" + "location_address_1": "Carnegie Mellon University", + "location_address_2": "Carnegie Music Hall", + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["748m"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["748m"] } } ], "revision": 2 @@ -9823,23 +9221,18 @@ "task_id": 364, "intent_template_id": 58, "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking", - "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "location/address_1": "Carnegie Mellon University", - "location/address_2": "UPMC Shadyside" + "location_address_1": "Carnegie Mellon University", + "location_address_2": "UPMC Shadyside", + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["1.7km"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.7km"] } } ], "revision": 2 @@ -9849,23 +9242,18 @@ "task_id": 365, "intent_template_id": 58, "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking", - "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "location/address_1": "Carnegie Music Hall", - "location/address_2": "UPMC Shadyside" + "location_address_1": "Carnegie Music Hall", + "location_address_2": "UPMC Shadyside", + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["2.2km"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["2.2km"] } } ], "revision": 2 @@ -9875,23 +9263,18 @@ "task_id": 366, "intent_template_id": 58, "start_urls": ["__MAP__"], - "intent": "Measure distance between CVS (closet one) and UPMC Shadyside by walking", - "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "intent": "Measure distance between CVS (closest one) and UPMC Shadyside by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "location/address_1": "CVS (closet one)", - "location/address_2": "UPMC Shadyside" + "location_address_1": "CVS (closest one)", + "location_address_2": "UPMC Shadyside", + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["1.2km"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.2km"] } } ], "revision": 2 @@ -9901,23 +9284,18 @@ "task_id": 367, "intent_template_id": 58, "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking", - "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking", + "intent": "Measure distance between Carnegie Mellon University and CVS (closest one) by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", + "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "location/address_1": "Carnegie Mellon University", - "location/address_2": "CVS (closet one)" + "location_address_1": "Carnegie Mellon University", + "location_address_2": "CVS (closest one)", + "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": true, "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["1.4km"] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.4km"] } } ], "revision": 2 @@ -9927,19 +9305,14 @@ "task_id": 368, "intent_template_id": 188, "start_urls": ["__SHOPPING__"], - "intent": "find discounted items.", - "intent_template": "find discounted items.", + "intent": "Return the list of discounted (sale) items available on the site.", + "intent_template": "Return the list of discounted (sale) items available on the site.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -9952,18 +9325,15 @@ "intent": "Pull up the description page of Carnegie Music Hall on Map", "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Carnegie Music Hall"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + 
"expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__/way/154257484/", "response_status": 200} + "expected": {"url": "__MAP__/way/154257484/"} } ], "revision": 2 @@ -9976,18 +9346,15 @@ "intent": "Pull up the description page of Carnegie Mellon University on Map", "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Carnegie Mellon University"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__/relation/2279034/", "response_status": 200} + "expected": {"url": "__MAP__/relation/2279034/"} } ], "revision": 2 @@ -10000,18 +9367,15 @@ "intent": "Pull up the description page of Piada restaurant near Pitt on Map", "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Piada restaurant near Pitt"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__/node/2710170970", "response_status": 200} + "expected": {"url": "__MAP__/node/2710170970"} } ], "revision": 2 @@ -10024,18 +9388,15 @@ "intent": "Pull up the description page of the Costco in Pittsburgh near a river on Map", "intent_template": "Pull up 
the description page of {{location}} on Map", "instantiation_dict": {"location": "the Costco in Pittsburgh near a river"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__/way/168456128", "response_status": 200} + "expected": {"url": "__MAP__/way/168456128"} } ], "revision": 2 @@ -10048,18 +9409,15 @@ "intent": "Pull up the description page of Whole Foods near Carnegie Mellon on Map", "intent_template": "Pull up the description page of {{location}} on Map", "instantiation_dict": {"location": "Whole Foods near Carnegie Mellon"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__/node/10114377662", "response_status": 200} + "expected": {"url": "__MAP__/node/10114377662"} } ], "revision": 2 @@ -10069,20 +9427,17 @@ "task_id": 374, "intent_template_id": 266, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the Magento Blank theme settings page", - "intent_template": "Show me the {{name}} theme settings page", + "intent": "Go to the Magento Blank theme settings page", + "intent_template": "Go to the {{name}} theme settings page", "instantiation_dict": {"name": "Magento Blank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": 
"navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1"} } ], @@ -10093,21 +9448,18 @@ "task_id": 375, "intent_template_id": 266, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the Magento Luma theme settings page", - "intent_template": "Show me the {{name}} theme settings page", + "intent": "Go to the Magento Luma theme settings page", + "intent_template": "Go to the {{name}} theme settings page", "instantiation_dict": {"name": "Magento Luma"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/"} + "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3"} } ], "revision": 2 @@ -10120,16 +9472,11 @@ "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", "intent_template": "Summarize customer reviews for {{product}}.", "instantiation_dict": {"product": "Amazon Echo Dot 3rd generation"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -10139,23 +9486,19 @@ "task_id": 377, "intent_template_id": 59, "start_urls": ["__MAP__"], - "intent": "Search 
for \"resturants near CMU ArtPark Lab\"", - "intent_template": "Search for \"{{space}} near {{location}}\"", - "instantiation_dict": {"location": "CMU ArtPark Lab", "space": "resturants"}, - "format_specification": null, + "intent": "Show on the map restaurants near CMU ArtPark Lab", + "intent_template": "Show on the map {{space}} near {{location}}", + "instantiation_dict": {"location": "CMU ArtPark Lab", "space": "restaurants"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, "expected": { "url": "__MAP__/search", - "response_status": 200, "query_params": { "query": ["restaurants near CMU ArtPark Lab"] } } } @@ -10167,23 +9510,19 @@ "task_id": 378, "intent_template_id": 59, "start_urls": ["__MAP__"], - "intent": "Search for \"parking near Carnegie Mellon University\"", - "intent_template": "Search for \"{{space}} near {{location}}\"", + "intent": "Show on the map parking near Carnegie Mellon University", + "intent_template": "Show on the map {{space}} near {{location}}", "instantiation_dict": {"location": "Carnegie Mellon University", "space": "parking"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, "expected": { "url": "__MAP__/search", - "response_status": 200, "query_params": { "query": ["parking near Carnegie Mellon University"] } } } @@ -10195,23 +9534,19 @@ "task_id": 379, "intent_template_id": 59, "start_urls": 
["__MAP__"], - "intent": "Search for \"hotels near Carnegie Mellon University\"", - "intent_template": "Search for \"{{space}} near {{location}}\"", + "intent": "Show on the map hotels near Carnegie Mellon University", + "intent_template": "Show on the map {{space}} near {{location}}", "instantiation_dict": {"location": "Carnegie Mellon University", "space": "hotels"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, "expected": { "url": "__MAP__/search", - "response_status": 200, "query_params": { "query": ["hotels near Carnegie Mellon University"] } } } @@ -10223,23 +9558,19 @@ "task_id": 380, "intent_template_id": 59, "start_urls": ["__MAP__"], - "intent": "Search for \"bars near Carnegie Music Hall\"", - "intent_template": "Search for \"{{space}} near {{location}}\"", + "intent": "Show on the map bars near Carnegie Music Hall", + "intent_template": "Show on the map {{space}} near {{location}}", "instantiation_dict": {"location": "Carnegie Music Hall", "space": "bars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, "expected": { "url": "__MAP__/search", - "response_status": 200, "query_params": { "query": ["bars near Carnegie Music Hall"] } } } @@ -10251,23 +9582,19 @@ "task_id": 381, "intent_template_id": 59, "start_urls": ["__MAP__"], - "intent": "Search for \"hotels near Carnegie Music Hall\"", 
- "intent_template": "Search for \"{{space}} near {{location}}\"", + "intent": "Show on the map hotels near Carnegie Music Hall", + "intent_template": "Show on the map {{space}} near {{location}}", "instantiation_dict": {"location": "Carnegie Music Hall", "space": "hotels"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, "expected": { "url": "__MAP__/search", - "response_status": 200, "query_params": { "query": ["hotels near Carnegie Music Hall"] } } } @@ -10279,19 +9606,14 @@ "task_id": 382, "intent_template_id": 781, "start_urls": ["__MAP__"], - "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", - "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services", + "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services. (Use the OSRM direction service.)", + "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services. 
(Use the OSRM direction service.)", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -10301,27 +9623,26 @@ "task_id": 383, "intent_template_id": 782, "start_urls": ["__MAP__"], - "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", - "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel", - "instantiation_dict": {}, - "format_specification": "Use \"hotel\" for the hotel name and \"supermarkets\" for the list of supermarket names", + "intent": "I am arriving at Pittsburgh Airport. Find if there is a Hyatt hotel nearby. If there is, get me its name and the names of any supermarkets that are within 15 minutes driving from the hotel. Return a list of objects with keys \"hotel\" and \"supermarkets\". (Use the OSRM direction service.)", + "intent_template": "I am arriving at Pittsburgh Airport. Find if there is a Hyatt hotel nearby. If there is, get me its name and the names of any supermarkets that are within 15 minutes driving from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" and \"supermarkets\"" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { "type": "object", "properties": { - "hotel" : { "type": "string" }, - "supermarkets": { "type": "array" , "items": {"type": "string"} } - }, - "required": ["hotel", "supermarkets"] + "hotel": {"type": "string", "format": "location-name"}, + "supermarkets": { "type": "array", "items": {"type": "string", "format": "location-name"} } + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { @@ -10340,16 +9661,14 @@ "intent_template_id": 666, "start_urls": ["__SHOPPING__"], "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", - "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases", - "instantiation_dict": {}, - "format_specification": null, + "intent_template": "List the customer names who {{review_criteria}}", + "instantiation_dict": {"review_criteria": "complain about the quality of EYZUTAK phone cases"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Lisa Lee", "Evelyn Kurver", "Amanda", "N Randall"] } @@ -10363,16 +9682,14 @@ "intent_template_id": 666, "start_urls": ["__SHOPPING__"], "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", - "intent_template": "List the customer names who thinks EYZUTAK phone cases are of good looking", - "instantiation_dict": {}, - "format_specification": null, + "intent_template": "List the customer names who {{review_criteria}}", + "instantiation_dict": 
{"review_criteria": "thinks EYZUTAK phone cases are of good looking"}, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Lisa Lee", "MH", "Misba009", "Amanda", "Cally", "Bethany Robertson"] } @@ -10385,20 +9702,14 @@ "task_id": 386, "intent_template_id": 1355, "start_urls": ["__SHOPPING__"], - "intent": "What is the rating of Ugreen lightning to 3.5mm cable", - "intent_template": "What is the rating of {{product}}", + "intent": "What is the rating of Ugreen lightning to 3.5mm cable?", + "intent_template": "What is the rating of {{product}}?", "instantiation_dict": {"product": "Ugreen lightning to 3.5mm cable"}, - "format_specification": "Return a value between 0 and 100", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [65] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [65] } } ], "revision": 2 @@ -10411,14 +9722,12 @@ "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", "instantiation_dict": {"stars": "4 or 5 stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["MH", "Misba009", "Amanda", "Amazon Customer", "Cally", "Bethany Robertson"] } @@ -10434,14 +9743,12 @@ "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", "instantiation_dict": 
{"stars": "1 or 2 stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Evelyn Kurver", "N Randall"] } @@ -10461,26 +9768,22 @@ "repo": "primer/design", "content": "Thanks, working on reviews" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": "__GITLAB__/primer/design/notes", "http_method": "POST", - "query_string": {"target_id": "139245", "target_type": "merge_request"}, "post_data": { "$.note.noteable_type": "MergeRequest", "$.note.note": "Thanks, working on reviews" }, - "response_status": 200 + "query_params": { "target_id": ["139245"], "target_type": ["merge_request"] } } } ], @@ -10498,23 +9801,19 @@ "repo": "a11yproject/a11yproject.com", "content": "lgtm" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": "__GITLAB__/a11yproject/a11yproject.com/notes", "http_method": "POST", - "query_string": {"target_id": "72594", "target_type": "merge_request"}, "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "lgtm"}, - "response_status": 200 + "query_params": { "target_id": ["72594"], "target_type": 
["merge_request"] } } } ], @@ -10532,26 +9831,22 @@ "repo": "a11yproject/a11yproject.com", "content": "close because non reproducible" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": "__GITLAB__/a11yproject/a11yproject.com/notes", "http_method": "POST", - "query_string": {"target_id": "72404", "target_type": "merge_request"}, "post_data": { "$.note.noteable_type": "MergeRequest", "$.note.note": "close because non reproducible" }, - "response_status": 200 + "query_params": { "target_id": ["72404"], "target_type": ["merge_request"] } } } ], @@ -10565,23 +9860,19 @@ "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", "instantiation_dict": {"mr": "color ulitity", "repo": "a11yproject.com", "content": "Good idea"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": "__GITLAB__/a11yproject/a11yproject.com/notes", "http_method": "POST", - "query_string": {"target_id": "72135", "target_type": "merge_request"}, "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Good idea"}, - "response_status": 200 + "query_params": { "target_id": ["72135"], "target_type": ["merge_request"] } } } ], @@ -10599,23 
+9890,19 @@ "repo": "byteblaze/empathy-prompts", "content": "lgtm" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, "expected": { "url": "__GITLAB__/byteblaze/empathy-prompts/notes", "http_method": "POST", - "query_string": {"target_id": "138843", "target_type": "merge_request"}, "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "lgtm"}, - "response_status": 200 + "query_params": { "target_id": ["138843"], "target_type": ["merge_request"] } } } ], @@ -10629,12 +9916,11 @@ "intent": "Fork 2019-nCov.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "2019-nCov"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10656,12 +9942,11 @@ "intent": "Fork the Pytorch GAN repo with most stars.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "the Pytorch GAN repo with most stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10683,12 +9968,11 @@ "intent": "Fork ChatGPT.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "ChatGPT"}, - "format_specification": null, "eval": [ { 
"evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10710,16 +9994,14 @@ "intent": "Fork MetaSeq.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "MetaSeq"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/33/fork", "http_method": "POST", @@ -10738,12 +10020,11 @@ "intent": "Fork all repos from Akilesh Kannan.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "all repos from Akilesh Kannan"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10811,12 +10092,11 @@ "intent": "Change my bio to \"I am a robot\" in the discussion forum", "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "I am a robot"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10824,10 +10104,8 @@ "expected": { "url": 
"__REDDIT__/user/MarvelsGrantMan136/edit_biography", "http_method": "POST", - "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, "post_data": {"user_biography[biography]": "I am a robot"}, - "response_status": 302, - "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + "response_status": 302 } } ], @@ -10841,12 +10119,11 @@ "intent": "Change my bio to \"Pro Python Developer with 20 years of Experience\" in the discussion forum", "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Pro Python Developer with 20 years of Experience"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10854,12 +10131,10 @@ "expected": { "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", "http_method": "POST", - "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, "post_data": { "user_biography[biography]": "Pro Python Developer with 20 years of Experience" }, - "response_status": 302, - "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + "response_status": 302 } } ], @@ -10873,12 +10148,11 @@ "intent": "Change my bio to \"Seeking SDE positions\" in the discussion forum", "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Seeking SDE positions"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ 
-10886,10 +10160,8 @@ "expected": { "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", "http_method": "POST", - "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, "post_data": {"user_biography[biography]": "Seeking SDE positions"}, - "response_status": 302, - "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + "response_status": 302 } } ], @@ -10903,12 +10175,11 @@ "intent": "Change my bio to \"Freelance Web Developer\" in the discussion forum", "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Freelance Web Developer"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10916,10 +10187,8 @@ "expected": { "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", "http_method": "POST", - "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, "post_data": {"user_biography[biography]": "Freelance Web Developer"}, - "response_status": 302, - "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + "response_status": 302 } } ], @@ -10933,12 +10202,11 @@ "intent": "Change my bio to \"Awesome Prompt Artist\" in the discussion forum", "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", "instantiation_dict": {"content": "Awesome Prompt Artist"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10946,10 +10214,8 @@ 
"expected": { "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", "http_method": "POST", - "headers": {"referer": "__REDDIT__/user/MarvelsGrantMan136/edit_biography"}, "post_data": {"user_biography[biography]": "Awesome Prompt Artist"}, - "response_status": 302, - "redirect_url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography" + "response_status": 302 } } ], @@ -10963,12 +10229,11 @@ "intent": "Upvote the newest post in books forum", "intent_template": "Upvote the newest post in {{forum}} forum", "instantiation_dict": {"forum": "books"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -10989,12 +10254,11 @@ "intent": "Upvote the newest post in DIY forum", "intent_template": "Upvote the newest post in {{forum}} forum", "instantiation_dict": {"forum": "DIY"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11015,12 +10279,11 @@ "intent": "Upvote the newest post in future technology forum", "intent_template": "Upvote the newest post in {{forum}} forum", "instantiation_dict": {"forum": "future technology"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11041,12 +10304,11 @@ "intent": "Upvote 
the newest post in the deep learning forum", "intent_template": "Upvote the newest post in {{forum}} forum", "instantiation_dict": {"forum": "the deep learning"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11067,12 +10329,11 @@ "intent": "Upvote the newest post in explain like im 5 forum", "intent_template": "Upvote the newest post in {{forum}} forum", "instantiation_dict": {"forum": "explain like im 5"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11091,17 +10352,18 @@ "intent_template_id": 23, "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"", - "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", "instantiation_dict": { "position_description": "the manager of the website", + "location": "in this post", + "comment_prefix": "", "content_description": "thanks! I am a big fan of your website." 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11121,17 +10383,18 @@ "intent_template_id": 23, "start_urls": ["__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself"], "intent": "Reply to the first reply in this post with \"don't panic\"", - "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"", + "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", "instantiation_dict": { "position_description": "the first reply", + "location": "in this post", + "comment_prefix": "", "content_description": "don't panic" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11153,25 +10416,35 @@ "intent": "Change the LICENSE for repo byteblaze/cloud-to-butt to an MIT license", "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "license": "an MIT license"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { + 
"_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, "expected": { - "url": "__GITLAB__/byteblaze/cloud-to-butt/-/update/master/LICENSE.txt", + "url": [ + "__GITLAB__/byteblaze/cloud-to-butt/-/update/master/LICENSE.txt", + "__GITLAB__/byteblaze/cloud-to-butt/-/create/master", + "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE" + ], "http_method": "POST", "post_data": { - "_method": "put", - "file_path": "LICENSE", - "commit_message": "Update LICENSE.txt", - "branch_name": "master" + "$.^file_path|file_name$": "LICENSE", + "branch_name": "master", + "content": "^.*MIT.*License.*$", + "_method": "^.*(?!delete$).*$" }, "response_status": 302 } @@ -11190,25 +10463,35 @@ "repo": "byteblaze/accessible-html-content-patterns", "license": "an Apache License" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { + "_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, "expected": { - "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/update/main/LICENSE", + "url": [ + "__GITLAB__/byteblaze/accessible-html-content-patterns/-/update/main/LICENSE", + "__GITLAB__/byteblaze/accessible-html-content-patterns/-/create/main", + "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE" + ], "http_method": "POST", "post_data": { - "_method": "put", - "file_path": "LICENSE", - "commit_message": "Update LICENSE", - "branch_name": "main" + 
"$.^file_path|file_name$": "LICENSE", + "branch_name": "main", + "content": "^.*Apache.*License.*$", + "_method": "^.*(?!delete$).*$" }, "response_status": 302 } @@ -11227,25 +10510,35 @@ "repo": "byteblaze/a11y-syntax-highlighting", "license": "one that mandates all copies and derivative works to be under the same license" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", - "last_event_only": true, + "post_data_schema": { + "type": "object", + "properties": { + "_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, "expected": { - "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/update/main/LICENSE", + "url": [ + "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/update/main/LICENSE", + "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/create/main", + "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE" + ], "http_method": "POST", "post_data": { - "_method": "put", - "file_path": "LICENSE", - "commit_message": "Update LICENSE", - "branch_name": "main" + "$.^file_path|file_name$": "LICENSE", + "branch_name": "main", + "content": "^.*GNU GENERAL PUBLIC LICENSE.*$", + "_method": "^.*(?!delete$).*$" }, "response_status": 302 } @@ -11261,22 +10554,38 @@ "intent": "Change the LICENSE for repo byteblaze/dotfiles to an MIT license", "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", "instantiation_dict": {"repo": "byteblaze/dotfiles", "license": "an MIT license"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", 
"status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "file_path": "LICENSE", - "substring": "MIT License", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, + "expected": { + "url": [ + "__GITLAB__/byteblaze/dotfiles/-/update/main/LICENSE", + "__GITLAB__/byteblaze/dotfiles/-/create/main", + "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE" + ], + "http_method": "POST", + "post_data": { + "$.^file_path|file_name$": "LICENSE", + "branch_name": "main", + "content": "^.*MIT.*License.*$", + "_method": "^.*(?!delete$).*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -11286,22 +10595,21 @@ "task_id": 415, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Check the code review assigned to me for verification functions. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", - "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". 
If not, tag them with their username (e.g., @user) as a reminder.", + "intent": "Reply on the merge request assigned to me for verification functions: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", + "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "verification functions"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/byteblaze/a11y-webring.club/notes?target_id=138783&target_type=merge_request", "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "@davepgreene "} + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "^.*@davepgreene.*$"} } } ], @@ -11312,15 +10620,14 @@ "task_id": 416, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Check the code review assigned to me for wcag. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", - "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". 
If not, tag them with their username (e.g., @user) as a reminder.", + "intent": "Reply on the merge request assigned to me for wcag: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", + "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "wcag"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11338,22 +10645,21 @@ "task_id": 417, "intent_template_id": 360, "start_urls": ["__GITLAB__"], - "intent": "Check the code review assigned to me for 404 link. If the last message in the code review is from the author, reply \"Thank you\". If not, tag them with their username (e.g., @user) as a reminder.", - "intent_template": "Check the code review assigned to me for {{topic}}. If the last message in the code review is from the author, reply \"Thank you\". 
If not, tag them with their username (e.g., @user) as a reminder.", + "intent": "Reply on the merge request assigned to me for 404 link: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", + "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", "instantiation_dict": {"topic": "404 link"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/a11yproject/a11yproject.com/notes?target_id=72560&target_type=merge_request", "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Thank you"} + "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "^.*@Roshanjossey.*$"} } } ], @@ -11367,12 +10673,11 @@ "intent": "Set my gitlab status as Busy.", "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Busy"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11393,12 +10698,11 @@ "intent": "Set my gitlab status as Enjoying life.", "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Enjoying life"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11409,8 +10713,7 @@ "message": "Enjoying life.", "availability": "busy", "clear_status_after": null - }, - "response_status": 200 + } } } ], @@ -11424,12 +10727,11 @@ "intent": "Set my gitlab status as Playing Badminton.", "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Playing Badminton"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11440,8 +10742,7 @@ "message": "Playing Badminton.", "availability": "busy", "clear_status_after": null - }, - "response_status": 200 + } } } ], @@ -11455,12 +10756,11 @@ "intent": "Set my gitlab status as Resting due to leg injury.", "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Resting due to leg injury"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11471,8 +10771,7 @@ "message": "Resting due to leg injury.", "availability": "busy", "clear_status_after": null - }, - "response_status": 200 + } } } ], @@ -11486,12 +10785,11 @@ "intent": "Set my gitlab status as Out of Office.", "intent_template": "Set my gitlab status as {{status}}.", "instantiation_dict": {"status": "Out of Office"}, - "format_specification": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -11502,8 +10800,7 @@ "message": "Out of Office.", "availability": "busy", "clear_status_after": null - }, - "response_status": 200 + } } } ], @@ -11514,28 +10811,22 @@ "task_id": 423, "intent_template_id": 237, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all Hollister shirts on sale", - "intent_template": "Mark all {{brand}} shirts on sale", + "intent": "Set all Hollister shirts to on-sale status", + "intent_template": "Set all {{brand}} shirts to on-sale status", "instantiation_dict": {"brand": "Hollister"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "last_event_only": false, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/126/type/configurable/store/0/set/9/", - "headers": { - "referer": "__SHOPPING_ADMIN__/admin/catalog/product/edit/id/126/", - "X-Requested-With": "XMLHttpRequest" - }, + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/126/type/configurable/store/0/set/\\d+/back/edit$", "post_data": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"}, - "query_params": { "isAjax": ["true"] } + "response_status": 302 } } ], @@ -11546,15 +10837,19 @@ "task_id": 424, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the place where Mr. 
Rogers was filmed on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", - "instantiation_dict": {"description": "the place where Mr. Rogers was filmed"}, - "format_specification": null, + "intent": "On the map site, view the info page for the city where A Beautiful Day in the Neighborhood was filmed (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", + "instantiation_dict": {"description": "the city where A Beautiful Day in the Neighborhood was filmed"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/188553"} } ], "revision": 2 @@ -11564,15 +10859,19 @@ "task_id": 425, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the longest bridge in the Western hemisphere on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", + "intent": "On the map site, view the info page for the longest bridge in the Western hemisphere (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", "instantiation_dict": {"description": "the longest bridge in the Western hemisphere"}, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/5475586"} } ], "revision": 2 @@ -11582,17 +10881,21 @@ "task_id": 426, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", + "intent": "On the map site, view the info page for the city in Pennsylvania where a plane crashed during the September 11th attacks (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", "instantiation_dict": { - "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks" + "description": "the city in Pennsylvania where a plane crashed during the September 11th attacks" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/189076"} } ], "revision": 2 @@ -11602,15 +10905,21 @@ "task_id": 427, "intent_template_id": 371, 
"start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the university that has most Turning Award winners on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", - "instantiation_dict": {"description": "the university that has most Turning Award winners"}, - "format_specification": null, + "intent": "On the map site, view the info page for the university with 26 Turing Award winners as of December 2021 (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", + "instantiation_dict": { + "description": "the university with 26 Turing Award winners as of December 2021" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/65066"} } ], "revision": 2 @@ -11620,17 +10929,21 @@ "task_id": 428, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", + "intent": "On the map site, view the info page for the undergrad college of the person who developed the Nash equilibrium (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page 
for {{description}} (use the provided wiki site to look up any needed information).", "instantiation_dict": { "description": "the undergrad college of the person who developed the Nash equilibrium" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/2279034"} } ], "revision": 2 @@ -11640,15 +10953,26 @@ "task_id": 429, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", - "instantiation_dict": {"description": "the colleges where The Chair was filmed in Pittsburgh"}, - "format_specification": null, + "intent": "On the map site, view the info page for the college or colleges where The Chair was filmed (open each in a separate tab) (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", + "instantiation_dict": { + "description": "the college or colleges where The Chair was filmed (open each in a separate tab)" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + 
"ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/583390395"} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/relation/172206707"} } ], "revision": 2 @@ -11658,17 +10982,21 @@ "task_id": 430, "intent_template_id": 371, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map (use the provided wiki site to look up any needed information).", - "intent_template": "Find the page of {{description}} on the map (use the provided wiki site to look up any needed information).", + "intent": "On the map site, view the info page for the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh (use the provided wiki site to look up any needed information).", + "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", "instantiation_dict": { "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": [".*"], + "expected": {"url": "__MAP__/TASK_430_MAP_RELATION_ID"} } ], "revision": 2 @@ -11685,22 +11013,18 @@ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/32202$", "http_method": "POST", - "post_data": {"qty": "1", "item": "32202"}, - "response_status": 200 + "post_data": {"qty": "1", "item": "32202"} } } ], @@ -11717,22 +11041,18 @@ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/104498$", "http_method": "POST", - "post_data": {"qty": "1", "item": "104498"}, - "response_status": 200 + "post_data": {"qty": "1", "item": "104498"} } } ], @@ -11749,17 +11069,19 @@ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - 
"evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B07Q1NRQBW"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/52674$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "52674"} + } } ], "revision": 2 @@ -11775,17 +11097,19 @@ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B08MFJFHQ4"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/11615$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "11615"} + } } ], "revision": 2 @@ -11801,22 +11125,18 @@ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/77300$", "http_method": "POST", - "post_data": {"qty": "1", "item": "77300"}, - "response_status": 
200 + "post_data": {"qty": "1", "item": "77300"} } } ], @@ -11830,17 +11150,19 @@ "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a mattress foundation", "time": "around Feb or March 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B07DFJ5XKH"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/14854$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "14854"} + } } ], "revision": 2 @@ -11853,17 +11175,19 @@ "intent": "I previously ordered a table lamp in May 2023 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", "instantiation_dict": {"product": "a table lamp", "time": "in May 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B087QSCXGT"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/15033$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "15033"} + } } ], "revision": 2 @@ -11876,17 +11200,19 @@ "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a TV stand", "time": "sometime around sep 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B017SK5E3M"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/35837$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "35837"} + } } ], "revision": 2 @@ -11899,17 +11225,19 @@ "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", "instantiation_dict": {"product": "a cat t-shirt", "time": "during 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B0844BWS76"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/24493$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "24493"} + } } ], "revision": 2 @@ -11922,17 +11250,19 @@ "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?", "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", "instantiation_dict": {"product": "a make up removal kit", "time": "during summer 2022"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B0738JQG6Q"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/51958$", + "http_method": "POST", + "post_data": {"qty": "1", "item": "51958"} + } } ], "revision": 2 @@ -11945,16 +11275,14 @@ "intent": "Update and commit (to the default branch) the website code for the current project using the simple online file editor to change the browser tab title to \"GIVE ME SPACE\"", "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project 
using the simple online file editor to change the browser tab title to \"{{title}}\"", "instantiation_dict": {"title": "GIVE ME SPACE", "target_branch": "the default branch"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", "http_method": "POST", @@ -11979,16 +11307,14 @@ "intent": "Update and commit (to main) the website code for the current project using the simple online file editor to change the browser tab title to \"Welcome to my site\"", "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", "instantiation_dict": {"title": "Welcome to my site", "target_branch": "main"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", "http_method": "POST", @@ -12016,16 +11342,14 @@ "title": "Not an interesting site", "target_branch": "a new branch with no merged request" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", 
"status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", "http_method": "POST", @@ -12053,16 +11377,14 @@ "title": "Title Wanted", "target_branch": "a new branch called title-update with no merged request" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", "http_method": "POST", @@ -12087,16 +11409,14 @@ "intent": "Update and commit (to main) the website code for the current project using the simple online file editor to change the browser tab title to \"Hello\"", "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", "instantiation_dict": {"title": "Hello", "target_branch": "main"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", "http_method": "POST", @@ -12118,19 +11438,26 @@ "task_id": 446, "intent_template_id": 999, "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.", - "intent_template": "Assign the 
issue regarding {{issue}} in {{repo}} to {{account}}.", - "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "Roshanjossey"}, - "format_specification": null, + "intent": "Assign the issue regarding 404 errors in a11yproject to Roshanjossey.", + "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", + "instantiation_dict": { + "issue": "404 errors", + "issue_location": "in a11yproject ", + "account": "Roshanjossey" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], + "post_data_schema": { + "type": "object", + "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } + }, "expected": { "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1478.json", "http_method": "PUT", @@ -12146,22 +11473,25 @@ "intent_template_id": 999, "start_urls": ["__GITLAB__"], "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", + "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", "instantiation_dict": { - "repo": "a11y-webring.club", "issue": "linking to an accessibility statement", + "issue_location": "in a11y-webring.club ", "account": "Rohan" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], 
+ "post_data_schema": { + "type": "object", + "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } + }, "expected": { "url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71.json", "http_method": "PUT", @@ -12179,12 +11509,11 @@ "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://egg.tart.com"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -12206,12 +11535,11 @@ "intent": "set the homepage URL on my GitLab profile to helloworld.xyz", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "helloworld.xyz"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -12234,12 +11562,11 @@ "intent": "set the homepage URL on my GitLab profile to https://a11yproject.contributor.me", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://a11yproject.contributor.me"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -12261,12 
+11588,11 @@ "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "www.byteblaze.com"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -12291,12 +11617,11 @@ "intent": "set the homepage URL on my GitLab profile to https://byteblaze.github.io", "intent_template": "set the homepage URL on my GitLab profile to {{url}}", "instantiation_dict": {"url": "https://byteblaze.github.io"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -12318,18 +11643,20 @@ "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Teton pullover hoodie"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Teton pullover hoodie", - "site": "shopping_admin", - "expected": {"is_disabled": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/78/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[status]": "2"}, + "response_status": 302 + } } ], "revision": 2 @@ -12342,18 +11669,20 @@ "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Ryker Tee Crew Neck"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Ryker LumaTech™ Tee (Crew-neck)", - "site": "shopping_admin", - "expected": {"is_disabled": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/478/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[status]": "2"}, + "response_status": 302 + } } ], "revision": 2 @@ -12366,18 +11695,20 @@ "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.", "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Helios Endurance Tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Helios Endurance Tank", - "site": "shopping_admin", - "expected": {"is_disabled": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + 
"url": "^__SHOPPING_ADMIN__/catalog/product/save/id/676/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[status]": "2"}, + "response_status": 302 + } } ], "revision": 2 @@ -12390,18 +11721,20 @@ "intent": "Disable Cora Pant from the site, they are facing some quality issues.", "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Cora Pant"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Cora Parachute Pant", - "site": "shopping_admin", - "expected": {"is_disabled": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1840/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[status]": "2"}, + "response_status": 302 + } } ], "revision": 2 @@ -12414,18 +11747,20 @@ "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", "instantiation_dict": {"product": "Karmen yoga pants"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Karmen Yoga Pant", - "site": "shopping_admin", - "expected": {"is_disabled": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1819/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[status]": "2"}, + "response_status": 302 + } } ], "revision": 2 @@ -12437,19 +11772,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], "intent": "Reduce the price of the product on the current page by $5", "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "$5", "performed_operation": "Reduce"}, - "format_specification": null, + "instantiation_dict": {"amount": "$5", "action": "Reduce"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1481", - "site": "shopping_admin", - "expected": {"price": "27.00"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1481/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "27.00"}, + "response_status": 302 + } } ], "revision": 2 @@ -12461,19 +11798,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/237/"], "intent": "Reduce the price of the product on the current page by 10%", "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "10%", "performed_operation": "Reduce"}, - "format_specification": null, + "instantiation_dict": {"amount": "10%", "action": "Reduce"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "237", - "site": "shopping_admin", - "expected": {"price": "62.10"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/237/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "62.10"}, + "response_status": 302 + } } ], "revision": 2 @@ -12485,19 +11824,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/418/"], "intent": "Reduce the price of the product on the current page by 15%", "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "15%", "performed_operation": "Reduce"}, - "format_specification": null, + "instantiation_dict": {"amount": "15%", "action": "Reduce"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "418", - "site": "shopping_admin", - "expected": {"price": "38.25"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/418/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "38.25"}, + "response_status": 302 + } } ], "revision": 2 @@ -12509,19 +11850,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/721/"], "intent": "Increase the price of the product on the current page by $11.5", "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "$11.5", "performed_operation": "Increase"}, - "format_specification": null, + "instantiation_dict": {"amount": "$11.5", "action": "Increase"}, "eval": [ 
{ "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "721", - "site": "shopping_admin", - "expected": {"price": "29.50"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/721/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "29.50"}, + "response_status": 302 + } } ], "revision": 2 @@ -12533,19 +11876,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/710/"], "intent": "Increase the price of the product on the current page by 10%", "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "10%", "performed_operation": "Increase"}, - "format_specification": null, + "instantiation_dict": {"amount": "10%", "action": "Increase"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "710", - "site": "shopping_admin", - "expected": {"price": "19.80"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/710/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "19.80"}, + "response_status": 302 + } } ], "revision": 2 @@ -12557,19 +11902,21 @@ "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/996/"], "intent": "Increase the price of the product on the current page by 15%", "intent_template": "{{action}} the price of the 
product on the current page by {{amount}}", - "instantiation_dict": {"amount": "15%", "performed_operation": "Increase"}, - "format_specification": null, + "instantiation_dict": {"amount": "15%", "action": "Increase"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "996", - "site": "shopping_admin", - "expected": {"price": "36.80"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/996/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[price]": "36.80"}, + "response_status": 302 + } } ], "revision": 2 @@ -12582,19 +11929,16 @@ "intent": "Update the product description of Antonia Racer Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Antonia Racer Tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1796/type/configurable/store/0/set/9/back/edit", + "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1796/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": {"product[short_description]": "

1 customer(s) love it!

"} @@ -12613,17 +11957,20 @@ "instantiation_dict": { "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B074QVN413"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "71337", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -12636,17 +11983,14 @@ "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", "intent_template": "Add {{product}} to my wish list", "instantiation_dict": {"product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/wishlist/index/add/", "http_method": "POST", @@ -12667,17 +12011,20 @@ "instantiation_dict": { "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - 
"evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B09STCV25D"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "85498", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -12692,17 +12039,20 @@ "instantiation_dict": { "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B09QZRWT97"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "30843", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -12717,17 +12067,20 @@ "instantiation_dict": { "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B09QXM7B42"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "60047", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -12740,18 
+12093,19 @@ "intent": "Cancel order 302", "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "302"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "302", - "site": "shopping_admin", - "expected": {"status": "Canceled"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/302/", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -12764,18 +12118,19 @@ "intent": "Cancel order 307", "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "307"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "307", - "site": "shopping_admin", - "expected": {"status": "Canceled"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/307/", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -12788,18 +12143,19 @@ "intent": "Cancel order 299", "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "299"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "299", - 
"site": "shopping_admin", - "expected": {"status": "Canceled"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/299/", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -12812,18 +12168,19 @@ "intent": "Cancel order 301", "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "301"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "301", - "site": "shopping_admin", - "expected": {"status": "Canceled"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/301/", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -12836,18 +12193,19 @@ "intent": "Cancel order 305", "intent_template": "Cancel order {{id}}", "instantiation_dict": {"id": "305"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "305", - "site": "shopping_admin", - "expected": {"status": "Canceled"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/305/", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -12857,21 +12215,28 @@ "task_id": 475, "intent_template_id": 292, "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name chatgpt_plugin?", - "intent_template": "Set up a new, empty repository with 
the name {{project_name}}?", + "intent": "Set up a new, empty repository with the name chatgpt_plugin", + "intent_template": "Set up a new, empty repository with the name {{project_name}}", "instantiation_dict": {"project_name": "chatgpt_plugin"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "chatgpt_plugin", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "chatgpt_plugin", + "project[namespace_id]": "2505", + "project[path]": "chatgpt_plugin", + "project[initialize_with_readme]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12881,21 +12246,28 @@ "task_id": 476, "intent_template_id": 292, "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_llm_reading?", - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "intent": "Set up a new, empty repository with the name awesome_llm_reading", + "intent_template": "Set up a new, empty repository with the name {{project_name}}", "instantiation_dict": {"project_name": "awesome_llm_reading"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "awesome_llm_reading", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + 
"http_method": "POST", + "post_data": { + "project[name]": "awesome_llm_reading", + "project[namespace_id]": "2505", + "project[path]": "awesome_llm_reading", + "project[initialize_with_readme]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12905,21 +12277,28 @@ "task_id": 477, "intent_template_id": 292, "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?", - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning", + "intent_template": "Set up a new, empty repository with the name {{project_name}}", "instantiation_dict": {"project_name": "awesome_program_aided_reasoning"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "awesome_program_aided_reasoning", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "awesome_program_aided_reasoning", + "project[namespace_id]": "2505", + "project[path]": "awesome_program_aided_reasoning", + "project[initialize_with_readme]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12929,21 +12308,28 @@ "task_id": 478, "intent_template_id": 292, "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name webagent?", - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "intent": "Set up a new, empty repository with the name webagent", + "intent_template": "Set up a new, empty repository with the name {{project_name}}", 
"instantiation_dict": {"project_name": "webagent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "webagent", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "webagent", + "project[namespace_id]": "2505", + "project[path]": "webagent", + "project[initialize_with_readme]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -12953,21 +12339,28 @@ "task_id": 479, "intent_template_id": 292, "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_webagent?", - "intent_template": "Set up a new, empty repository with the name {{project_name}}?", + "intent": "Set up a new, empty repository with the name awesome_webagent", + "intent_template": "Set up a new, empty repository with the name {{project_name}}", "instantiation_dict": {"project_name": "awesome_webagent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "project": "awesome_webagent", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[name]": "awesome_webagent", + "project[namespace_id]": "2505", + "project[path]": "awesome_webagent", + "project[initialize_with_readme]": "0" + }, + "response_status": 302 + } } ], 
"revision": 2 @@ -12980,12 +12373,11 @@ "intent": "Invite yjlou as collaborator(s) to solarized-prism-theme repo", "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": {"collaborator_account_list": "yjlou", "repo": "solarized-prism-theme"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13007,12 +12399,11 @@ "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Abishek"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13034,12 +12425,11 @@ "intent": "yjlou wants to check my dotfile configurations. Please invite him to the repo as a guest.", "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "yjlou"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13061,12 +12451,11 @@ "intent": "Koushik wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Koushik"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13088,12 +12477,11 @@ "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. Please invite him to the repo as a guest.", "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Jakub Klinkovsk\u00fd"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13115,12 +12503,11 @@ "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.", "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", "instantiation_dict": {"name": "Vinta"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -13140,22 +12527,19 @@ "intent_template_id": 275, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { - "old-heading": "404 Not Found", - "heading": "Bruh bro you clicked the wrong page" + "heading": "Bruh bro you clicked the wrong page", + "old_heading": "404 Not Found" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", "http_method": "POST", @@ -13177,22 +12561,19 @@ "intent_template_id": 275, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { - "old-heading": "Enable Cookies", - 
"heading": "Cookie monster coming to your place" + "heading": "Cookie monster coming to your place", + "old_heading": "Enable Cookies" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", "http_method": "POST", @@ -13214,19 +12595,16 @@ "intent_template_id": 275, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": {"old-heading": "Home Page", "heading": "This is the home page!! Leave here!!"}, - "format_specification": null, + "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": {"heading": "This is the home page!! 
Leave here!!", "old_heading": "Home Page"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", "http_method": "POST", @@ -13248,22 +12626,19 @@ "intent_template_id": 275, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", + "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", "instantiation_dict": { - "old-heading": "Privacy Policy", - "heading": "No privacy policy is needed in this dystopian world" + "heading": "No privacy policy is needed in this dystopian world", + "old_heading": "Privacy Policy" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", "http_method": "POST", @@ -13285,19 +12660,16 @@ "intent_template_id": 275, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", - "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": {"old-heading": "About us", "heading": "Secret"}, - 
"format_specification": null, + "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", + "instantiation_dict": {"heading": "Secret", "old_heading": "About us"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", "http_method": "POST", @@ -13316,13 +12688,12 @@ "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Sarah Miller", "message": "the order is ready to be shipped soon!"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "mutate", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -13338,18 +12709,23 @@ "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Jane Doe", "message": "sorry we are out of stock, please reorder"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "302", - "site": 
"shopping_admin", - "expected": {"comment": "sorry we are out of stock, please reorder"} + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/302", + "http_method": "POST", + "post_data": { + "history[comment]": "sorry we are out of stock, please reorder", + "history[is_customer_notified]": "1" + } + } } ], "revision": 2 @@ -13365,19 +12741,22 @@ "name": "Grace Nguyen", "message": "sorry we are bankrupt, please contact our customer service for refund" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "307", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["isAjax"], "expected": { - "comment": "sorry we are bankrupt, please contact our customer service for refund" + "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/65", + "http_method": "POST", + "post_data": { + "history[comment]": "sorry we are bankrupt, please contact our customer service for refund", + "history[is_customer_notified]": "1" + } } } ], @@ -13391,18 +12770,23 @@ "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"", "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Alex Thomas", "message": "Yo, your order will be shipped soon!"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", 
"status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "304", - "site": "shopping_admin", - "expected": {"comment": "Yo, your order will be shipped soon!"} + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/304", + "http_method": "POST", + "post_data": { + "history[comment]": "Yo, your order will be shipped soon!", + "history[is_customer_notified]": "1" + } + } } ], "revision": 2 @@ -13415,18 +12799,23 @@ "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", "instantiation_dict": {"name": "Lily Potter", "message": "Thanks, your order is ready to be shipped!"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "303", - "site": "shopping_admin", - "expected": {"comment": "Thanks, your order is ready to be shipped!"} + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["isAjax"], + "expected": { + "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/303", + "http_method": "POST", + "post_data": { + "history[comment]": "Thanks, your order is ready to be shipped!", + "history[is_customer_notified]": "1" + } + } } ], "revision": 2 @@ -13439,17 +12828,14 @@ "intent": "Update order #299 with the Federal Express tracking number 8974568499", "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "8974568499", "order": "299", "service": 
"Federal Express"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/299/", "http_method": "POST", @@ -13468,17 +12854,14 @@ "intent": "Update order #307 with the DHL tracking number 24353446464", "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "24353446464", "order": "307", "service": "DHL"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/307/", "http_method": "POST", @@ -13497,17 +12880,14 @@ "intent": "Update order #306 with the UPS tracking number 55591023930", "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "55591023930", "order": "306", "service": "UPS"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": 
"__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/306/", "http_method": "POST", @@ -13526,17 +12906,14 @@ "intent": "Update order #304 with the USPS tracking number 13849373987", "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "13849373987", "order": "304", "service": "USPS"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/304/", "http_method": "POST", @@ -13555,17 +12932,14 @@ "intent": "Update order #301 with the DHL tracking number 239028439840", "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", "instantiation_dict": {"tracking": "239028439840", "order": "301", "service": "DHL"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/301/", "http_method": "POST", @@ -13582,21 +12956,18 @@ "intent_template_id": 287, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Mark all Taurus Elements Shell as out of stock", - "intent_template": "Make all {{product}} as out of stock", + "intent_template": "Mark all {{product}} as out of stock", "instantiation_dict": {"product": "Taurus Elements Shell"}, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/350/type/configurable/store/0/set/10/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/350/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, "response_status": 302 @@ -13611,21 +12982,18 @@ "intent_template_id": 287, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Mark all Gobi HeatTec Tee as out of stock", - "intent_template": "Make all {{product}} as out of stock", + "intent_template": "Mark all {{product}} as out of stock", "instantiation_dict": {"product": "Gobi HeatTec Tee"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/446/type/configurable/store/0/set/9/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/446/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, "response_status": 302 @@ -13640,21 +13008,18 @@ "intent_template_id": 287, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Mark all rocco gym tank as out of stock", - "intent_template": "Make all {{product}} as out of stock", + 
"intent_template": "Mark all {{product}} as out of stock", "instantiation_dict": {"product": "rocco gym tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/682/type/configurable/store/0/set/10/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/682/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, "response_status": 302 @@ -13669,21 +13034,18 @@ "intent_template_id": 287, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Mark all Selene yoga hoodie as out of stock", - "intent_template": "Make all {{product}} as out of stock", + "intent_template": "Mark all {{product}} as out of stock", "instantiation_dict": {"product": "Selene yoga hoodie"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/10/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, "response_status": 302 @@ -13698,21 +13060,18 @@ "intent_template_id": 287, "start_urls": ["__SHOPPING_ADMIN__"], 
"intent": "Mark all Aeon capri as out of stock", - "intent_template": "Make all {{product}} as out of stock", + "intent_template": "Mark all {{product}} as out of stock", "instantiation_dict": {"product": "Aeon capri"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1861/type/configurable/store/0/set/10/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1861/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, "response_status": 302 @@ -13729,17 +13088,14 @@ "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200. Discard any items in your cart if it is not empty.", "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "meat substitute", "dollar_value": "between 100 and 200"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "decode_base64_query": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/22490$", @@ -13749,8 +13105,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["\\_"], "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", @@ -13762,8 +13116,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", "http_method": "POST", @@ -13772,8 +13124,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], @@ -13787,17 +13137,14 @@ "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000. Discard any items in your cart if it is not empty.", "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "Ceiling light", "dollar_value": "above 1000"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "decode_base64_query": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/71506$", @@ -13807,8 +13154,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["\\_"], "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", @@ -13820,8 +13165,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", "http_method": "POST", @@ -13830,8 +13173,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], @@ -13845,17 +13186,14 @@ "intent": "Buy the highest rated product from the NS switch category within a budget under 60. Discard any items in your cart if it is not empty.", "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "NS switch", "dollar_value": "under 60"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "decode_base64_query": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/40448$", @@ -13865,8 +13203,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["\\_"], "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", @@ -13878,8 +13214,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", "http_method": "POST", @@ -13888,8 +13222,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], @@ -13902,18 +13234,15 @@ "start_urls": ["__SHOPPING__"], "intent": "Buy the best rating product from \"Men\"s shoe\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. 
Choose any available variant.", - "instantiation_dict": {"category": "Men's shoe"}, - "format_specification": null, + "instantiation_dict": {"category": "Men\"s shoe"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "decode_base64_query": true, "expected": { "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/65885$", @@ -13923,8 +13252,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "ignored_query_params_patterns": ["\\_"], "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", @@ -13933,8 +13260,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", "http_method": "POST", @@ -13943,8 +13268,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": {"url": "__SHOPPING__/checkout/onepage/success"} } ], @@ -13958,44 +13281,71 @@ "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. 
Choose any available variant.", "instantiation_dict": {"category": "Home Audio Speaker"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B002R5ABIW"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 511, - "intent_template_id": 189, + "evaluator": "NetworkEventEvaluator", + "decode_base64_query": true, + "expected": { + "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/75640$", + "http_method": "POST", + "post_data": {"product": "75640"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["_"], + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", + "response_content": { + "items_qty": 1, + "$.items[0].name": "Atlantic Technology FS-7.0-GLB 7-channel Surround Bar (Gloss Black) (Discontinued by Manufacturer)" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", + "http_method": "POST", + "post_data": {"$.billingAddress.customerId": "27"} + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": {"url": "__SHOPPING__/checkout/onepage/success"} + } + ], + "revision": 2 + }, + { + "sites": ["shopping"], + "task_id": 511, + "intent_template_id": 189, "start_urls": ["__SHOPPING__"], "intent": "Add a laundry detergent to my wish list.", "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "laundry detergent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} 
+ "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", + "evaluator": "NetworkEventEvaluator", "expected": { - "skus": [ - "B00VRAF73M", "B01M10LBSQ", "B074QVN413", "B0777TM4B8", "B07N8SJ5GF", - "B08V7GXG7F" - ] + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "response_status": 302, + "response_cookies": { + "mage-messages": "^.*(?=.*laundry)(?=.*detergent).* has been added to your wish list.*$" + }, + "post_data": {"qty": null} } } ], @@ -14009,148 +13359,20 @@ "intent": "Add a toothpaste to my wish list.", "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "toothpaste"}, - "format_specification": null, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": { - "skus": [ - "B00028F3WU", "B0007DA4IS", "B000KU720Q", "B000NRT0TW", "B000OEXMW0", - "B000RL6K0Q", "B000WSV82Y", "B0013G6M3E", "B0013USYR2", "B001CMW800", - "B001E0SWB4", "B001E16LEI", "B001E6JLOA", "B001E77NOO", "B001ECQRGO", - "B001EJOPUC", "B001FWXS10", "B001G0MEC0", "B001I63AUW", "B001RK3Q5S", - "B001RUXR2K", "B001WAKI3Q", "B00277BGOI", "B00288D752", "B002BL75FO", - "B002EIW31A", "B002JAI47K", "B002K5V4OY", "B0031Z66O0", "B00394ZNUQ", - "B003CMNQZY", "B003FWQ0JA", "B003LZSQ5M", "B003WVTGSQ", "B0041U8TGW", - "B0041WF82I", "B004EML5CW", "B004FSYPC2", "B004GW64K8", "B004GWVQEC", - "B004HNJ7HI", "B004I75GXW", "B004I7756S", "B004I7985O", "B004J759U6", - "B004K3WSBM", "B004M99CDG", "B004NTFXKG", "B004XRN0DU", "B004YWOMZE", - "B004Z2484S", "B00518BXO8", "B00531UB6Y", "B0053XLIN2", "B005B0RHR8", - "B005B0RIZY", "B005FAL8ZG", "B005FGKT9Q", "B005FUHD8W", "B005JC3N0I", - "B005PKTDZ8", "B005TKZH9U", "B0061WDOHW", "B006H9J4VY", "B00757A9B6", - 
"B007HO56HO", "B0080L9FHA", "B0080L9FUM", "B0084DKJL0", "B008D5I4XY", - "B008K8BWL0", "B008L39LAS", "B008UBWJTG", "B0092IS26U", "B0094E5ZOY", - "B0096CZ7W4", "B0096I3OSM", "B009AO70OG", "B00A15CDKE", "B00A1CGLUU", - "B00A3UIHIY", "B00B977F60", "B00CID86AI", "B00CLG85M6", "B00D5766VC", - "B00DAHBSZQ", "B00DGDLYBC", "B00DLSSDNO", "B00DX5LCOC", "B00DZEE7O8", - "B00E4MKPYQ", "B00E4ML7PC", "B00E4MQY3W", "B00F27E520", "B00F5DXIJC", - "B00FD1YRNC", "B00FJD7C68", "B00G4ELZF0", "B00G4EM07M", "B00GDTFK1Q", - "B00GHR8X4K", "B00GYG1NUU", "B00GYUTEZM", "B00H202F0O", "B00HA71X7A", - "B00HT5CQ06", "B00I5H5ZEG", "B00IAJG0XO", "B00IG0SXRI", "B00IOPZ7W4", - "B00IUIVMGK", "B00IXKX0DI", "B00IXL0BSY", "B00IZ6F8K8", "B00J2L54O0", - "B00J36583Q", "B00J5J7F0A", "B00J5JAAJI", "B00J5JB89Y", "B00J7GHNLC", - "B00JIJ6KXU", "B00JIKZHCY", "B00JR20OKS", "B00JRTG2FW", "B00JUJ1BCS", - "B00JUJ1BI2", "B00JVG9PV4", "B00KF98NAU", "B00KIHL02M", "B00LAY88ZY", - "B00LEVU3KG", "B00LJ0X24G", "B00MH85X7S", "B00MPRGT1A", "B00NESSWAA", - "B00NIAULVC", "B00NQOYQCK", "B00R3K2KYE", "B00S7MC9SY", "B00TA4B7JA", - "B00TNMWYFA", "B00VQTOSCA", "B00VVLZ0CA", "B00W0FEWUC", "B00W5MLOFG", - "B00XU23FFW", "B00XWTNKD0", "B00ZATYH3C", "B00ZGX4OW6", "B0105DZI3U", - "B010AXZ916", "B010TGMIO0", "B010XR5MJI", "B012RIL1M8", "B01414P92Y", - "B0141GSTA6", "B01488F40K", "B014DUHR94", "B015T474SC", "B0160NT6FU", - "B016OPY4EC", "B0170YM9EI", "B017KQRZAE", "B0184ZBLPK", "B018ILDW3Y", - "B018KS268M", "B018KS2U8S", "B018KURG1W", "B018LM1MSM", "B0195UTBGY", - "B0195UTBHI", "B019BR67UY", "B019JE0WXW", "B01ANVG068", "B01B1E050S", - "B01BJQ4GDK", "B01BLP49HW", "B01BNF2904", "B01BYQZ52G", "B01C3GFNA0", - "B01C47HJH8", "B01DJBDXH8", "B01DWR0AR0", "B01F4JR0TA", "B01F66UR5K", - "B01GR1QZSQ", "B01H8ABSAA", "B01HA8D1SM", "B01HNCJ864", "B01HQVC38C", - "B01HVQWGQQ", "B01IA9BJ4I", "B01IA9BN44", "B01IADWIAS", "B01IADWJOS", - "B01IADWNUI", "B01IADWUUQ", "B01IADX5DW", "B01IADXOXS", "B01IAFLGHW", - "B01IAFLM0I", "B01IAFLNE8", "B01IFHDLYQ", "B01JN6HHDY", 
"B01KCJENYC", - "B01KCJG1RO", "B01LVXPXY9", "B01LWOHLOV", "B01LXO6WHU", "B01LZW6QAY", - "B01M0INUID", "B01MR5SH9Z", "B01N23FCL2", "B01N3NPZTL", "B01N49LBSI", - "B01N7JXTVU", "B01N7PMK4M", "B01N97Q4BE", "B01NASHAAW", "B01NBU1MJT", - "B06W5JWFPJ", "B06WP9KG52", "B06WP9Y81S", "B06X419HV1", "B06XCWJJ53", - "B06XH21RDK", "B06XJNZHQC", "B06XKWKY3C", "B06XX2P91N", "B06Y4B4VNY", - "B06ZY1ZZ85", "B071D9J53C", "B071H57RP1", "B071J4W1VW", "B071WVVDSZ", - "B071XT8YMQ", "B072WB4W3Y", "B0735BJP4B", "B0735DL5W5", "B073FCXT4Z", - "B073VPT81R", "B073WD693Y", "B073WR94PV", "B073ZJCG5V", "B0744G9ZRH", - "B07469FDK1", "B0748DLWD2", "B074D9RYKG", "B074J27TZF", "B074J99K5Z", - "B074KHS4J9", "B074T52KP5", "B074ZPFJNM", "B075XSN2HZ", "B0763NTP8X", - "B0763PFFKH", "B0767PTXS8", "B076H3WKRL", "B0771V325R", "B077H6PVN1", - "B077H74B75", "B078938GT9", "B078HQ48JX", "B078JJ538V", "B078JJ56VP", - "B078NJFR48", "B078YG5SVQ", "B078YGYYNB", "B0792LV9TN", "B0792M4NQL", - "B0792MRTG1", "B0792PZCKN", "B0792QF9Z6", "B079945GC2", "B079DD9GWZ", - "B079Y99F75", "B079YW2HNF", "B079Z94R32", "B07B2PT7T3", "B07BMRVLV5", - "B07BQDPVQ3", "B07BS8D4RC", "B07C1ZWH94", "B07C5M9VNV", "B07CCDD61K", - "B07CKX86XP", "B07CMXS929", "B07CX7R8RP", "B07CZ6QXX9", "B07D1S5468", - "B07D496BHJ", "B07DF7G67C", "B07DP6W9HK", "B07DP72KYF", "B07FCJWVCC", - "B07FKXXPWR", "B07FN84GP8", "B07FXWSXW8", "B07G7F5HHR", "B07GD6DSJQ", - "B07GNTD35G", "B07GZ5X5VC", "B07GZ5X6M1", "B07GZ6XYJR", "B07GZ72CLF", - "B07GZ771DJ", "B07GZ8JK4Q", "B07GZGN942", "B07HR3M4TC", "B07JNJ3LPT", - "B07JQT13NL", "B07KFZ5TZQ", "B07KY1D3T4", "B07KZR6PYK", "B07L61QFSR", - "B07LHJDLS3", "B07MCQ1WNJ", "B07MK5VKQY", "B07MMHZ7RD", "B07N85QTL5", - "B07N89BMDZ", "B07NDGM7WD", "B07NGP8R4J", "B07NK8GCGD", "B07NQRS8ZS", - "B07P6LR9ZD", "B07PMNCCH4", "B07PS5NFCW", "B07PT41YZQ", "B07PTKVVML", - "B07PYRBBKV", "B07Q4QNK49", "B07Q6LLPM5", "B07Q8BMCB6", "B07QGC7N6R", - "B07QKZNL77", "B07QMB4FZ6", "B07QSS11LC", "B07R23Q7VQ", "B07R926VLW", - "B07RHYZBD9", "B07RJ9WCVD", 
"B07RN2L869", "B07RZTYVBB", "B07S2YB6VH", - "B07S77SVS9", "B07S844G5N", "B07S9LLHJD", "B07SNPBW86", "B07SPTPF4Y", - "B07T4B7YNF", "B07T9CK42J", "B07TLZSWSL", "B07TRKYZ9D", "B07TVSF5S9", - "B07TX37WW2", "B07TXZ8ML7", "B07V5QQH6S", "B07V6Q6FV5", "B07VN73SNX", - "B07W4CSVBY", "B07WR6FCLR", "B07XHWS8Q1", "B07XQQNVN1", "B07Y2BVJ4X", - "B07Y2DXRCY", "B07Y4JR7SR", "B07YBJT86S", "B07YFYNLZC", "B07YQ39VKP", - "B07YYQJM8T", "B07ZHZWH7X", "B07ZQ6SJFG", "B07ZYBYQST", "B0813Y9JSW", - "B0813YH41G", "B0816Y4YRX", "B081B632Q3", "B081CZ1Q2S", "B081JL4F8B", - "B081Q5PF5W", "B081VW91ZT", "B08258WTC9", "B082F1LVBR", "B082VL1KQ9", - "B0836PYM8J", "B083BT1JV2", "B083HV5HLG", "B083JHCCV2", "B083TZXWB6", - "B08414HZ3S", "B0848M8FYQ", "B084BPVPMZ", "B084BTXCMZ", "B084BZP8QY", - "B0853BQ42Q", "B0856Q567M", "B085L3842H", "B085WBW3N2", "B086H6WSTJ", - "B086KM71JR", "B087WZL9CG", "B087X1R6JH", "B0887XN77Z", "B0887Y8F3V", - "B088FVSMVF", "B088MFQTHY", "B088WVS5VX", "B08923V311", "B0899XZSNT", - "B089KVNV55", "B089S9HKF6", "B089VGNJSB", "B08B125T7N", "B08CYCJFVQ", - "B08D6H45V1", "B08DNRS5PW", "B08FMPTFTG", "B08FXP4757", "B08FXS828J", - "B08G1XJJFV", "B08G81N9NB", "B08H5MR8QW", "B08HKF9NSN", "B08HQWQR2S", - "B08J7RYJSM", "B08KJHMSKJ", "B08LM728R2", "B08N4BXJTK", "B08NCQGZJ8", - "B08NK4SHXY", "B08P3FRWQR", "B08PMTSBBB", "B08PTDZZQQ", "B08PV13VLD", - "B08QSJTQXS", "B08R7CYPWG", "B08R7W193L", "B08RCRNJBT", "B08SKX2Z1M", - "B08TRMF995", "B08TTHQF5Q", "B08VDRL2T7", "B08VJDVQ81", "B08W277P5L", - "B08WTWYXT2", "B08XN22DQN", "B08XQ97YV2", "B08YNK6MVV", "B08YY4LST7", - "B08ZKMV7MG", "B0915TLNJC", "B0915X52TC", "B0917PN39F", "B091DWV93B", - "B0923S1MWS", "B092XB11YC", "B0936X16ZX", "B093QG3KBD", "B093RF6CPZ", - "B09413DRYS", "B094H16MCZ", "B094ZBNGH6", "B095CVGGK7", "B095SYZ16P", - "B095YDBXKX", "B095Z6SVNB", "B0963Q5CT6", "B097T129C8", "B0982ZJMND", - "B098DYSPVX", "B098FLJKVD", "B099BZHZP1", "B099J4YBBG", "B099JF2K7J", - "B099KVT3M6", "B099PD2FDN", "B09B6G62JK", "B09B7M8K3R", "B09BCTJC4Y", - 
"B09BCV4NS1", "B09BCVBK53", "B09BFZZ19Z", "B09BMFFRB2", "B09CFR181D", - "B09CM36M5Q", "B09CPB2ZCH", "B09DRT2DBM", "B09DYQ1RBR", "B09F5GD2YW", - "B09FF5811V", "B09FHXKBWK", "B09FJH65B4", "B09FLHJBD5", "B09FM129KX", - "B09FM2DBN6", "B09FXSFY8P", "B09G2QZYTP", "B09G2ZPB5L", "B09G5THVP8", - "B09GBLDC1W", "B09GFC528V", "B09GM6WV79", "B09GVQZRDR", "B09H5BWDPG", - "B09HNWBNGF", "B09HQ83ZY4", "B09HTPC8G9", "B09J4ZPZQX", "B09J88Z66L", - "B09JGBFBQY", "B09JP1QVTC", "B09K7DF7JF", "B09KG3ZJJN", "B09KMY1RVP", - "B09KRVB3W7", "B09KYCLCBZ", "B09L3B3V9D", "B09L7VVY17", "B09LCTPKGT", - "B09LLKTB2G", "B09LQV19Z5", "B09LTTCGGJ", "B09LVDV5GS", "B09M1QCB7N", - "B09MFNJYXF", "B09MGWGF2P", "B09MQL93X6", "B09MQLZ9GW", "B09MQM3DY2", - "B09MQMLD6Q", "B09MQSMD1G", "B09MQVN528", "B09MRCQJ3Y", "B09MRRTSMR", - "B09MRTX572", "B09MSNXR2W", "B09N9CBVLL", "B09N9WM3VG", "B09NBQ7BYL", - "B09NMCZ634", "B09NNK1NK5", "B09NP1876Z", "B09NVY6472", "B09NWBSTM9", - "B09NX91TXG", "B09NY92KRC", "B09NY9TVQJ", "B09NY9WRJP", "B09NYF3N14", - "B09P4RXW46", "B09P67MF9B", "B09P7T1P6T", "B09P7ZCHY9", "B09P8GG6JY", - "B09P9Z7VMZ", "B09PBT25FQ", "B09PDFMLK3", "B09PDT7PQL", "B09PHB4XGP", - "B09PNC6152", "B09PRDQG1R", "B09PV2KQN3", "B09PVFJDB1", "B09PVPJJ7L", - "B09PVQQPJV", "B09PXYS6HC", "B09PY7RR75", "B09PYL23PC", "B09PYLHLLJ", - "B09PYPGNTS", "B09Q17RP7T", "B09Q2DRTJB", "B09Q4CY5SG", "B09Q5X789L", - "B09Q5ZRS2X", "B09Q675GH6", "B09Q6H3LN5", "B09Q827KDV", "B09Q8Q62PP", - "B09Q8XWKHX", "B09QBZQJ61", "B09QCMHFT5", "B09QFWWD9B", "B09QG1GD19", - "B09QG6QB9K", "B09QG88RXS", "B09QGHBB3M", "B09QJ7C3XX", "B09QKPW5WC", - "B09QL1HXG6", "B09QM3V2FZ", "B09QMDQXCV", "B09QMFFLDR", "B09QP3TYXJ", - "B09QPRL8PJ", "B09QQ5KZFY", "B09QQDMG75", "B09QSMDB16", "B09QSR758Z", - "B09QWZ36C9", "B09QWZFNTF", "B09QX3GMJ2", "B09QX4VW3M", "B09R1Q8Z74", - "B09R1QZJQK", "B09R1WC4PH", "B09R1YK2NV", "B09R2KPMMW", "B09R2LSSPM", - "B09R47Z36F", "B09R6NKZ7B", "B09R7KK3H6", "B09R8CTYHJ", "B09RFD44WZ", - "B09RJM2B15", "B09RMN3Y5M", "B09RMRPPCW", "B09RSJ5D9S", 
"B09RZPGCWB", - "B09RZR9VYH", "B09S3NVLCV", "B09S3SP6LB", "B09S5NS2WH", "B09S5RW1ZX", - "B09S5T2T4R", "B09S5WT75M", "B09S5Y2253", "B09S63FZNP", "B09S6QSQ4V", - "B09S6SN1XW", "B09SBD9YZY", "B09SCVNW6X", "B09SG21HJV", "B09SG21SHS", - "B09SGMW7T6", "B09SHBNH3B", "B09SHT47YZ", "B09SJ13L93", "B09SJ1CC73", - "B09SJ1LYKM", "B09SPPXSG6", "B09SQDJW6D", "B09SYPS6SJ", "B09T3LZNFL" - ] + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "response_status": 302, + "response_cookies": {"mage-messages": "^.*toothpaste.* has been added to your wish list.*$"}, + "post_data": {"qty": null} } } ], @@ -14164,368 +13386,20 @@ "intent": "Add a chair to my wish list.", "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "chair"}, - "format_specification": null, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": { - "skus": [ - "B0002M9LP6", "B000Q5XTE8", "B000UKLJ5K", "B000WK1X4U", "B001BWYA48", - "B001GCKOAW", "B001LJY36Q", "B001OW7JW0", "B0039MIBH0", "B0039MIMRE", - "B003BWS81C", "B003R8BOXO", "B003S7HHEY", "B003UYZMA6", "B003VYAJMG", - "B003WX99I6", "B0042H6SLW", "B0042TU6WW", "B0046TR4IW", "B0049MWPGC", - "B004D2UD30", "B004LQ1U9O", "B004VTHKXQ", "B005L0R0OI", "B005QHQYR0", - "B006SYV38K", "B00752SXZU", "B007HDY64O", "B007VLXFA8", "B008J4ZE7I", - "B008OTQ864", "B008OTSHSQ", "B008OTSIY4", "B009WNCPHW", "B00AVUQPSU", - "B00AZQKG4O", "B00BJ7D1FE", "B00BXKFHTK", "B00CKR4QT2", "B00EQ1TJII", - "B00F9GDMM8", "B00FW1AHP0", "B00G4NTR38", "B00GBUQAYI", "B00GOJDB7A", - 
"B00GPK8ODY", "B00HV9YK22", "B00IT42FXE", "B00J8U1C2S", "B00JWJJP3S", - "B00KOP4V8S", "B00KUPS6G0", "B00L3NQP0C", "B00MPG96M0", "B00MQ3GWFG", - "B00MSIB4H0", "B00NAWEE9S", "B00OCSNMO2", "B00OIQHCDA", "B00OU7MFAW", - "B00P21TU6C", "B00PW9UF0E", "B00PZMS73U", "B00QGY0PQS", "B00QSLD8XQ", - "B00RDJ8FNU", "B00RKNESIK", "B00TV4FYOI", "B00UGB65C0", "B00UZ369KC", - "B00YB262HS", "B00YRBDRTE", "B010C71PJA", "B0116W5B9K", "B01257NFZ2", - "B013B74U5O", "B013JBE2IC", "B016OIF2JU", "B017NEJWC2", "B017UM91PU", - "B0183K9PNQ", "B018462YV4", "B01AFODPO8", "B01B4X0RGI", "B01BDPX98A", - "B01BIEHME8", "B01BL29M40", "B01BTH2XZM", "B01CINCTE6", "B01DMT92M2", - "B01DOBJLRY", "B01DZRM30Y", "B01F7B219O", "B01F8MD9HK", "B01FUAQCSS", - "B01FWQ90KC", "B01G2ELLMI", "B01GTQF5N0", "B01GTRTGDE", "B01GTS4L1U", - "B01H765O3C", "B01HMWRNUS", "B01HQGUBVS", "B01I5B1Q1M", "B01IKMHF66", - "B01IQYBFVE", "B01IR8U42K", "B01J4MVL48", "B01J4NE2FC", "B01KQ4KS1A", - "B01LDHMZO2", "B01LJ1GKZW", "B01LN2K7FQ", "B01LYNG09O", "B01LZHLH12", - "B01M0JHX3F", "B01M10CXK5", "B01M5FURAH", "B01MQTZTFY", "B01MQW49T3", - "B01MU7H89G", "B01MYA7OAK", "B01MYFPK88", "B01N0C01Z6", "B01N0XC48T", - "B01N1V0RT0", "B01N2WKUZB", "B01N4QB5WP", "B01N5FV14H", "B01N5O58OA", - "B01N5R9MBW", "B01N8SQPGS", "B01NAEM3NY", "B01NBWJDJZ", "B01NCEU0UK", - "B01NCY3ZH1", "B01NH2WXQ8", "B06VSXQXPX", "B06VVW24GY", "B06WD2PTCX", - "B06XD964Q5", "B06XDPT7BG", "B06XG4L63W", "B06XJ9KGCZ", "B06XP295XG", - "B06XPGYDCP", "B06XQW9KYP", "B06XTNKQFB", "B0716Z6WSJ", "B07176BLWK", - "B0719JSKS8", "B071JN9CM3", "B071LJTW4H", "B071NFSYLP", "B071NVV6B6", - "B071VG6PCC", "B07255YSWD", "B0725JXS9V", "B072BZ4XVJ", "B072FD6GLT", - "B072JMFMJC", "B072KDRZHP", "B072YFNQSR", "B073TY9SG9", "B073TYNJG4", - "B073VD36X8", "B073WQGDLC", "B073YDPZ5P", "B07451FV2F", "B074MZMXRY", - "B074VB3WNV", "B0759WQ3YW", "B075F9ZTT6", "B075H5D2LP", "B075ZX9F9Y", - "B0761R67K6", "B076DFC9RY", "B076HBJSVZ", "B076JJXY3X", "B076JRNQSQ", - "B076MGX5S5", "B076PMYFLL", "B076PS6FJ8", "B076Q33BYY", 
"B076ZYJPDL", - "B0773KJ2J9", "B077DRRXQT", "B077HDH76X", "B077HF6PML", "B077KY33TK", - "B077T9NHCS", "B077XWCV6D", "B078NPB4MR", "B078TDS6XG", "B078WZH3BQ", - "B078XZ211C", "B078Y49NC6", "B078Y8DSZS", "B078Z2P4XN", "B078ZL3H6J", - "B078ZLDSB5", "B078ZLFHV8", "B078ZM8VLB", "B0791CRM7L", "B0795W72NN", - "B0797HZ8W1", "B079CQN4KV", "B079HQ38SG", "B079J438ZT", "B079K3HP8G", - "B079RKQD1F", "B07B3NM7SP", "B07B9W2JSS", "B07BDHGYXR", "B07BKY1198", - "B07BXZX8N8", "B07BYJK8WK", "B07C61J4YR", "B07C69LPXX", "B07C7FZR5D", - "B07C89GZ6V", "B07CBG28PZ", "B07CBG3HFR", "B07CBG3HFY", "B07CBG496R", - "B07CHSR5SZ", "B07CHXSTJF", "B07CNX8SQ6", "B07CQK1W6F", "B07CWVR72G", - "B07CYLPD36", "B07D3RHMR8", "B07D9GR9KH", "B07DB45MPS", "B07DB4R43W", - "B07DD4QG4Y", "B07DLTW6TP", "B07DS19FCY", "B07DWVP5P6", "B07DXS9Z3K", - "B07F1R6QSG", "B07F2258F2", "B07FFG458D", "B07FKVFMXS", "B07FQTNSJP", - "B07FSCJTLX", "B07FSN9Z25", "B07FY56WR2", "B07FYMJMZX", "B07G3ZL8BM", - "B07G4K6VXG", "B07G9YL7YN", "B07GFSZ1GT", "B07GHS2GGN", "B07GTCHQSK", - "B07GTJH9S9", "B07GVFJP9R", "B07GVLXQDG", "B07GXJ3X7K", "B07GXJ5SHY", - "B07H3GKDYT", "B07H3KDMCY", "B07H3PG4ZJ", "B07H61TKMJ", "B07H717K62", - "B07H8XC2TC", "B07HF4YY52", "B07HFV5WXS", "B07HJVF26H", "B07HKZG9Y2", - "B07HNG4BR1", "B07HRHPZ69", "B07HSLP8FL", "B07HSWTVH8", "B07HWXN5DS", - "B07JGCB2GC", "B07JH6H5MP", "B07JPL7LC5", "B07JV7WWLC", "B07K1KYFWV", - "B07K3WRHLC", "B07K5TBB46", "B07KDS893Z", "B07KGPM6NX", "B07KNNM1BZ", - "B07L4KG1RW", "B07L5JB9ZX", "B07L82FM8Z", "B07L871TG5", "B07LFDK2R2", - "B07M5DQ1T9", "B07M64YFVN", "B07MC3V467", "B07MC7F12W", "B07MJQBY1C", - "B07MLT29MK", "B07MQJ5G1B", "B07MQTJ8MB", "B07MTM3H4W", "B07N1VR1SB", - "B07N1WG3ZZ", "B07N39N41Q", "B07N64X68W", "B07NJN8C92", "B07NKJLB7Z", - "B07NNWGJMY", "B07NRHSG7Y", "B07NY28PLV", "B07NYWXLDN", "B07P1DLRDY", - "B07P6JTVZV", "B07PBKZ59L", "B07PDXZS7H", "B07PG1Q7HF", "B07PMLRM15", - "B07PMRWR8W", "B07PNB13FL", "B07PS1H6XX", "B07PV2J77H", "B07Q26LJL6", - "B07Q2H39CW", "B07Q5L5488", 
"B07Q5XRMLD", "B07Q8JSJ6F", "B07QDNRFKJ", - "B07QG1SV98", "B07QLT67GN", "B07QLWTLLN", "B07QNTQKDP", "B07QRV2WKJ", - "B07QXKDV1V", "B07R3V952B", "B07R6RWYMZ", "B07R7PFW15", "B07R9Q36XM", - "B07RGKYS9K", "B07RZYCYK7", "B07S1L49ZT", "B07S5PYH6Q", "B07S745TYG", - "B07S8PCBGF", "B07S8VR9WJ", "B07S95TJDL", "B07S9RHZCD", "B07SCX2YMW", - "B07SJG72FG", "B07SW2JLK6", "B07SYHF5R2", "B07SZBWWDH", "B07T1193KF", - "B07T59J8YV", "B07T65P8XS", "B07T6Y57RM", "B07TD186H6", "B07TKYD9WK", - "B07TQT2YMB", "B07TT87TQW", "B07TTWTXQH", "B07TW28S2S", "B07TX4KV7K", - "B07TZPSB6G", "B07V1JYGT4", "B07V43SDMD", "B07V9S45CD", "B07VB7D6HZ", - "B07VB8TVRZ", "B07VCH7L8J", "B07VCH7PP6", "B07VFXKMNH", "B07VGLTMLY", - "B07VHPQMSX", "B07VK653KB", "B07VKLSZ4K", "B07VLMLJ4K", "B07VMGVZ1P", - "B07VPSPRR2", "B07VTZZMNY", "B07VVPD3PN", "B07VXTT5JW", "B07VY8K19X", - "B07VYX245K", "B07VYXM16M", "B07W5DZLZK", "B07W5S5D7Z", "B07WK5FT6N", - "B07WLDNZTW", "B07WNNCV74", "B07WNT5GMZ", "B07WS7YJLJ", "B07WVD6VL3", - "B07WWT8LWZ", "B07WWTKBJW", "B07WX825B5", "B07X2CKWD8", "B07X5NBRCL", - "B07X7MBYV2", "B07X92VH7C", "B07X9ZJBCV", "B07X9ZQG3J", "B07XC6G91T", - "B07XCM9SMW", "B07XCVJHP3", "B07XDYLCLD", "B07XFCT1P6", "B07XHCL2BD", - "B07XHKZ7J4", "B07XK1LVC7", "B07XM9P3X3", "B07XQ6H8LS", "B07XQPBCSY", - "B07XRFTD77", "B07XWYFKRH", "B07Y1SPDFS", "B07Y29VWLF", "B07Y38GFGT", - "B07YB5M9XK", "B07YDW4SPD", "B07YF3RZ6Q", "B07YFHGKDW", "B07YGNBFVV", - "B07YP7LNVZ", "B07YQFV7QK", "B07YQNGDKM", "B07YSGJZMT", "B07YXSZ2DF", - "B07YXSZ7H3", "B07YZBKFP7", "B07Z4FS3LF", "B07Z79B7Z3", "B07ZF9Y59G", - "B07ZHW9KF7", "B07ZNNJW9S", "B07ZPMBS7V", "B07ZS1JLHK", "B07ZW9QP69", - "B07ZWVZTF3", "B07ZX5SMM4", "B07ZYV95VZ", "B07ZZJKKK1", "B0811M32PM", - "B0812Z17RR", "B0813XXC6C", "B0816CVW6L", "B081GW63LW", "B081K72B57", - "B081MQWK51", "B081SXJ1FV", "B081VQ2CX3", "B081VYCDZM", "B081YDP24W", - "B08243WG1C", "B0827SZBY6", "B08281XPW1", "B082BKMHM1", "B082DR2HGZ", - "B082G9SKDC", "B082L1F4MS", "B082MDWHXM", "B082NW3WQZ", "B082PBJTT5", - 
"B082PCPQS9", "B082VKNM9K", "B082VQ21KB", "B082XJL59F", "B082YHQ859", - "B0831J7L9Z", "B08337T95Q", "B0833M8CLS", "B083545L5B", "B083BTC9Q1", - "B083FNGNZF", "B083MT6XHB", "B083TM2C5F", "B083VRVYCJ", "B083W3W9F4", - "B083W51RGF", "B083XYGFTN", "B083Z7HNNT", "B08411WVL8", "B0842NJ414", - "B0842Z6B97", "B0842ZKXPJ", "B0845P87XZ", "B0845Q6YKS", "B0846W55GW", - "B084C34FKY", "B084CZVRYH", "B084FF2ZNL", "B084FFM2YG", "B084FFML4Q", - "B084LJ7ZTX", "B084RZVHD2", "B084YT2MT5", "B084YWJ2HY", "B0853LZ2KK", - "B0854Z235T", "B085DJ6ZRH", "B085DNHL71", "B085SVMFCJ", "B085TS6HPZ", - "B085XZTMTK", "B085Y476KB", "B085ZJ1VLM", "B085ZRWSLC", "B0861XFFMK", - "B0866PCRV2", "B0867MY52S", "B0868VQVHX", "B086C285VB", "B086MLNLXN", - "B086R53Q71", "B086RPYM5H", "B086TXCVNR", "B086VYSZ29", "B086WN6DC2", - "B086Y77KLH", "B086YQWT6C", "B0872CR3QP", "B08765864R", "B087BJHNKX", - "B087DXNSN2", "B087JQM31Q", "B087KLT13X", "B087NLS2CQ", "B087PKXJFK", - "B087QBQRD5", "B087R8RMZC", "B087TZPGKL", "B0882P46V4", "B08835M1CR", - "B0887XLDYL", "B088H5CYXF", "B088KB1P3S", "B088KKCS4W", "B088NQBLP9", - "B088R5VDCB", "B088WC5W7Z", "B08976V7Z4", "B089DJ3X2D", "B089KB9843", - "B089LPSLTQ", "B089MCN5BT", "B089MWZN3P", "B089Q4TSFX", "B089VYHBHC", - "B089W5Y9J6", "B089W629P6", "B089YNHYQR", "B08B2HDJKF", "B08B5YRKL7", - "B08B637Q6X", "B08B68BBNY", "B08B8CFW5S", "B08BC7MMWZ", "B08BC8Q1LN", - "B08BCNB8X7", "B08BG5NDKY", "B08BJ67ZZM", "B08BJ723GP", "B08BKDVPDW", - "B08BL2YYG4", "B08BLM97LP", "B08BP9L79M", "B08BR4ZTRV", "B08BWR9XHP", - "B08BY91YQ9", "B08BZ6RDD9", "B08BZDXMTP", "B08C2WDVZ8", "B08C7D5VVW", - "B08C9B3R9F", "B08C9LTK5P", "B08CGJM3NZ", "B08CGYXL6B", "B08CHDKHXS", - "B08CMNWG55", "B08CTCLV36", "B08CVF664C", "B08CZ57S6V", "B08CZDXDPF", - "B08CZP8RBX", "B08CZQ56SD", "B08CZR9NNH", "B08D7Q4FHP", "B08D97PLHF", - "B08DNSN23H", "B08DRDWYCG", "B08DS3PGZ6", "B08DTQS27V", "B08DV73BW2", - "B08DWPMMCF", "B08DXKQTZS", "B08DXYJCVZ", "B08DY4RS68", "B08DY5QW1R", - "B08F3CGWVV", "B08F3JXDRX", "B08F4TZ458", "B08F5DDYMR", 
"B08F7ZQWNB", - "B08FB17N5F", "B08FBMW4KR", "B08FC6YMF2", "B08FDV3KDM", "B08FG3XN2C", - "B08FGMMV7Z", "B08FGNTTGG", "B08FHNZSV9", "B08FJ9WBBK", "B08FMK7KK1", - "B08FPJBVWF", "B08FRMFWY3", "B08FSNN6YM", "B08FT68T9Z", "B08FY9HJQ7", - "B08G1C1F4F", "B08G4CK7ZX", "B08G4Q2V85", "B08G77SGLY", "B08G8JZ8JB", - "B08GC5C5BV", "B08GCP34MQ", "B08GCQH2TY", "B08GGBGFBB", "B08GHSJ2VQ", - "B08GJ1B5M7", "B08GJHBL37", "B08GKKGFTF", "B08GNV4YYV", "B08GP89Q33", - "B08GQ696ZD", "B08GWT6T1Z", "B08GYMV5G4", "B08H4MY6QM", "B08H4RGBK1", - "B08H4RTBMJ", "B08H7DG9HW", "B08HC9DH2Z", "B08HD8SDX7", "B08HHZ46J7", - "B08HJ5VNBN", "B08HK39749", "B08HKZSN2M", "B08HM3LDB2", "B08HN1NPGZ", - "B08HN9K15V", "B08HRB68HY", "B08HRQHYWY", "B08HVM3GHY", "B08HWDCRSF", - "B08HYXGN71", "B08J2G7CV8", "B08J2R6L2G", "B08J4334DD", "B08J8C4R3F", - "B08JBX9DFD", "B08JGD6XWS", "B08JGT2YCQ", "B08JH43ZZH", "B08JHNGP9K", - "B08JPF7VGR", "B08JRXF687", "B08JSL8XNB", "B08JV62XGM", "B08JYY7MT7", - "B08JZ6NMZ3", "B08K2K3J4C", "B08K2XHCV4", "B08K34RNM1", "B08K758FG2", - "B08K91HN9V", "B08K94395B", "B08KFXYKMN", "B08KFXZZC5", "B08KHD75NQ", - "B08KL9BSCN", "B08KLPPNG1", "B08KPZMLWQ", "B08KT5GGKC", "B08L39H9T5", - "B08L3JZGDS", "B08L3YH8SD", "B08L461N1Q", "B08L5TVPXX", "B08L62TV6F", - "B08L6HKTF9", "B08L8LCRS3", "B08LD2RHFN", "B08LFYQ9C6", "B08LG1NNRN", - "B08LG9XSTQ", "B08LH7BQHL", "B08LHF7VLT", "B08LKMM9SG", "B08LMF347W", - "B08LS9FPZZ", "B08LSG69MN", "B08LSGKC84", "B08LVWBQVP", "B08LYRSQW6", - "B08M215R6C", "B08M3MS9W5", "B08M3MW7Z7", "B08M3ZDH7Y", "B08M3ZGLN6", - "B08M47BL4V", "B08M5CNSLW", "B08M5GWW3W", "B08M5LBJ9J", "B08M5LVPPV", - "B08M5M4772", "B08M5T347P", "B08M64ZY29", "B08M983DC1", "B08MBJTVQX", - "B08MFGSTY3", "B08MFJBFK2", "B08MFK1NL6", "B08ML5JQDC", "B08MPXPT2M", - "B08MQBHTL7", "B08MQMB5GD", "B08MTBFXP2", "B08MTS1KW7", "B08MZRF1LF", - "B08N3WPZ5C", "B08N5B2BFL", "B08N6F6BHQ", "B08N6JW8QN", "B08N6SMPSW", - "B08N6XKFG9", "B08NB6QJFJ", "B08NBBTFPG", "B08NJ1V4SR", "B08NPKWYBW", - "B08NPNBFXP", "B08NPYXF4H", 
"B08NQXP7RF", "B08NS77Q62", "B08NW152HQ", - "B08NWYV3BJ", "B08NX2DJ65", "B08NX2FP94", "B08NX7KYHM", "B08NXHVMCJ", - "B08NYLWX9V", "B08P1MN98G", "B08P1Z1PZ4", "B08P1ZXPJT", "B08P22Y83L", - "B08P2K8JHB", "B08P2L5F83", "B08P2NWVDS", "B08P3MYHCY", "B08P4MVNBW", - "B08P4PNTMR", "B08P4YFYMK", "B08P51BZXM", "B08P52YC4Q", "B08P5KF71X", - "B08P5Q2BPP", "B08P6M9L5V", "B08PCQFWH2", "B08PD2843V", "B08PKLQSZQ", - "B08PL1W4NC", "B08PL3CNGP", "B08PNY8LT3", "B08PP9KQLZ", "B08PPVVLJ4", - "B08Q1S1T9G", "B08Q3JX9W6", "B08Q7CY95N", "B08QCVQ1MG", "B08QF8XD1W", - "B08QG5J7TC", "B08QGPTJM9", "B08QHY5Z87", "B08QN37CV2", "B08QRFKQ9X", - "B08QRMY3JH", "B08QRPHTWM", "B08QV2QGSX", "B08QV6G73F", "B08QVCGJ29", - "B08R17ZBNG", "B08R2S2839", "B08R2ZFB1W", "B08R3PSW2C", "B08R66L28Y", - "B08R6VSYS6", "B08R74NWKH", "B08R8BWHLP", "B08R8JT7T7", "B08RD82QNR", - "B08RDGM6FD", "B08RN71L4L", "B08RR4JZNM", "B08RR5PJ1X", "B08RR6DQSQ", - "B08RSDX2TB", "B08RTHGKZM", "B08RTK7V3W", "B08RYLHPZ3", "B08RYY2GQL", - "B08RZ25KWB", "B08S2Y6KP9", "B08S38DZS9", "B08S3KL85G", "B08S6M942F", - "B08S6THC6H", "B08S6VS51T", "B08S7HHXDL", "B08SBDYZGR", "B08SCCKGM3", - "B08SH9SWGP", "B08SHXQ4BV", "B08SJLLKNP", "B08SK4HQVV", "B08SKY32DN", - "B08SPYC6FP", "B08SQJPH85", "B08SR3LY72", "B08SW1K2NV", "B08SW226WM", - "B08SW6RWMR", "B08SWL74DK", "B08T14FH5J", "B08T186JTQ", "B08T1DBKT2", - "B08T1MPFGX", "B08T61JT7H", "B08T7CS25P", "B08TB8B97C", "B08TBG9H3X", - "B08TBGDN6R", "B08TBGJPHG", "B08TBPQ53V", "B08TCR5V7N", "B08TCRZFPZ", - "B08TGH733K", "B08TGRN4X3", "B08TP1JS4W", "B08TTMZSW7", "B08TW3PKMF", - "B08TWY9XZ2", "B08V59811M", "B08VD1V9XB", "B08VD4739M", "B08VDNZMZZ", - "B08VF6PPD7", "B08VGCVRTN", "B08VJ4G41Q", "B08VND3Q31", "B08VRFGD3K", - "B08VRS2DVC", "B08VS7V4YW", "B08VVWSBYP", "B08VW7QR35", "B08W45HQLP", - "B08W52ZQFC", "B08W8QJPQ7", "B08W9RSB8H", "B08WCRHZ12", "B08WHBHCSG", - "B08WRY3ZVB", "B08WZ9Y9T2", "B08WZGPK5S", "B08X25GSSC", "B08X28WKM1", - "B08X3YJPJ7", "B08X41Z2VD", "B08X4CQ3QG", "B08X4F84WV", "B08X6GRX1M", - 
"B08X6LZQFV", "B08XB4K221", "B08XJTFF68", "B08XK1WY3S", "B08XLYS92M", - "B08XNM8LYM", "B08XNN9R75", "B08XNPY7ZY", "B08XNQSV69", "B08XWF74F8", - "B08XX55W2V", "B08XXD126G", "B08XYQPB1M", "B08Y1LXSGL", "B08Y1RR8F9", - "B08Y2PQZP2", "B08Y5F3BVV", "B08Y6L6GHS", "B08Y735644", "B08Y7JNGSH", - "B08Y8P3B27", "B08YDKKT8G", "B08YFL2CPD", "B08YJJ8D98", "B08YX151QW", - "B08Z1Z66LY", "B08Z26SFLK", "B08Z274YPS", "B08Z42K4X1", "B08Z8CYVNR", - "B08ZCF5H21", "B08ZCKYPG1", "B08ZCNQ275", "B08ZDRDMZY", "B08ZJ23PWT", - "B08ZJP6GWV", "B08ZKFH9C8", "B08ZMN5ZHQ", "B08ZN5BB95", "B08ZN8LBJF", - "B08ZNDLXFD", "B08ZV6FXZC", "B08ZXWR255", "B08ZYJVB2N", "B0911V3TPX", - "B0912TZB1P", "B0915TFYT9", "B0915XF43F", "B091BXL814", "B091C5862F", - "B091CFLDJY", "B091FGL75K", "B091FNRR2G", "B091J9NRS1", "B091K7C42P", - "B091KLXYLX", "B091MS2XMC", "B091MS9CH9", "B091MXRWWQ", "B091PMM9VJ", - "B091TKZJ76", "B091TPZCG3", "B091XWR8G4", "B091Y4TQWS", "B091YHHNHY", - "B091YQ9FV4", "B0922PHCGJ", "B0925FZL1V", "B0927FPD86", "B0927GZ5YJ", - "B0928QWQYC", "B09291D4G9", "B092HD5RZK", "B092HM68XW", "B092JR1L91", - "B092LCGN7Q", "B092M8F53R", "B092PNLVL3", "B092VLQNPN", "B092ZSFGM5", - "B09333RV2J", "B093B9RLHH", "B093BVWZY8", "B093C5L3Y7", "B093FDJ8H3", - "B093FNL67D", "B093K9SGX6", "B093P8P24D", "B093Y5KLLY", "B0943XZ96B", - "B094631831", "B0946M2JJ9", "B0946MNDB4", "B0946XWLWJ", "B094869J1B", - "B0948SLBBJ", "B0948WSSMH", "B094C1RLQM", "B094D271ZY", "B094FDK42K", - "B094MWK38V", "B094Q9NS4Y", "B094QJXDT2", "B094V6X323", "B094VDS76B", - "B094VG34CW", "B094VK56GN", "B094VNCW7L", "B094VVWLCH", "B094Y5JW7C", - "B0951GRZYW", "B0953846HT", "B09538PJ8K", "B0953D5CFT", "B0953KK817", - "B0957Y994D", "B095D8NPT2", "B095GXYP7W", "B095HWS33F", "B095KLDLZP", - "B095NNLX17", "B095P6HZ5Q", "B095VYB71Y", "B095WZ4HV2", "B095WZ8H98", - "B0963775J3", "B09656X96F", "B096783WXN", "B09679MGFV", "B0967GTGGK", - "B0967XY96K", "B0967Z4425", "B09687YJB7", "B09696STL5", "B096B1J7LH", - "B096CPR7JL", "B096F758K3", "B096FDJ3VG", "B096FFCC92", 
"B096FGV4Q8", - "B096FMJFZ8", "B096FXY4LX", "B096HR2S41", "B096KND1P2", "B096LZG6DX", - "B096QFVPTG", "B096QK6WBZ", "B096TPYMQ8", "B096VY3HHV", "B096Z7ZSQW", - "B096Z8T537", "B09729TDWR", "B0972PQKGH", "B097339C54", "B097357HWV", - "B0975RXRV1", "B0976TD987", "B0978TF24P", "B097B984NG", "B097BFFLF1", - "B097BG6P5Q", "B097DF29V3", "B097DLV23M", "B097DM8D9B", "B097DZK928", - "B097GHB6VR", "B097GZ2R31", "B097H41F7X", "B097HJ5XLQ", "B097JDGTHV", - "B097N1KCGH", "B097NJGPR2", "B097QV48FX", "B097R5R7CC", "B097RD43N7", - "B097RHM1PW", "B097SPTQH6", "B097SRVKKX", "B097TDRN7Y", "B097TH2BMB", - "B097YBDXJ7", "B097ZL4GS5", "B097ZL6F2G", "B0982QN762", "B0982SV1LN", - "B0983W33P5", "B0985P3DXF", "B0986F6Y1R", "B0986LJ35X", "B0987P3S4T", - "B0987TQ7V1", "B0988THPWT", "B0989GLPJF", "B0989Q594H", "B098B336HF", - "B098B62F42", "B098C2R1TJ", "B098DW5DCM", "B098JJZCBP", "B098JYFFYH", - "B098K86DR9", "B098KMPTWH", "B098MGK1PV", "B098MJ59GL", "B098P4J9X1", - "B098SMDGQQ", "B098W664WT", "B098X9VFLW", "B0992JNZW2", "B0995W5MPL", - "B0999C8LDB", "B099DDYFH5", "B099DFXQLW", "B099DR1GWX", "B099HSLHPG", - "B099JCH1X4", "B099MFTNYJ", "B099MRK5QG", "B099NTND1N", "B099Q49CZM", - "B099RFN7K6", "B099RMVMMC", "B099RRX3YL", "B099S7Z3WY", "B099SFDW8W", - "B099TZT2XR", "B099W2BV72", "B099WPCHH9", "B099Z777JG", "B09B14JPDC", - "B09B1BDFJY", "B09B38TXW8", "B09B6TW6KJ", "B09B7358HK", "B09B7JVJ6L", - "B09B7TD3T4", "B09B7ZLC6V", "B09B8258TS", "B09BCNMBFJ", "B09BDCHCY9", - "B09BDF115M", "B09BDGGT8J", "B09BF663Q4", "B09BFNDK3N", "B09BKPZB72", - "B09BL595RW", "B09BLQD1RG", "B09BMQ4JM5", "B09BMSW4JX", "B09BNFVMQC", - "B09BPYBD11", "B09BQ9DX7M", "B09BQD2NWG", "B09BR4BFVL", "B09BTTX77H", - "B09BVJCNSF", "B09BVM5TJZ", "B09BVT3GXL", "B09BZ2LJJH", "B09C16KJ99", - "B09C1DTMZB", "B09C1X9T2Y", "B09C3GNBHL", "B09C3NXR2Z", "B09C5CJNF3", - "B09C5Q13VH", "B09C6122Q1", "B09C61WXR1", "B09C7TYWP1", "B09C7X2HG2", - "B09C81TYGG", "B09C8GNSDS", "B09C8YD8NJ", "B09CDC9PQM", "B09CFTYGTD", - "B09CGNL32M", "B09CGVVLZZ", 
"B09CH5RKK7", "B09CJNS6MX", "B09CNMH58Z", - "B09CP7ZZFM", "B09CPZP5TW", "B09CQ293P1", "B09CQ3JJ18", "B09CTCJBPP", - "B09CYDL7Y2", "B09CYLDD5Z", "B09CYRJYHX", "B09CYXVFGV", "B09CZH3W21", - "B09D135T6F", "B09D2N8ZPC", "B09D31RLP6", "B09D3CF86R", "B09D3RVQB5", - "B09D4WPBD5", "B09D7LD5KW", "B09D9FV7CP", "B09DBJ82ZH", "B09DC7QJLG", - "B09DF5TY21", "B09DFCJ258", "B09DFPKJ2J", "B09DG799FB", "B09DGHXZS6", - "B09DKQ62JF", "B09DKYT56G", "B09DNZVKZT", "B09DPCKNBK", "B09DPT467Z", - "B09DRX4YT4", "B09DSTH5W9", "B09DV5D228", "B09DVK7Q8W", "B09DVTJPKP", - "B09DX1J7VD", "B09DYL8GJN", "B09DYPCDVW", "B09DYSVGBP", "B09DYVBZY8", - "B09F2Y6P3F", "B09F37SCVD", "B09F64B3KS", "B09F66XLWY", "B09F6BNVT6", - "B09F6JPT3G", "B09F6PDNDZ", "B09F9C1Y7F", "B09F9CFZJJ", "B09F9LFGDJ", - "B09F9Z2DFB", "B09FFP37XY", "B09FHHYMVM", "B09FJRG9G7", "B09FJYV6ZK", - "B09FNVYV8D", "B09FNYB5GL", "B09FPND68L", "B09FSJZCY8", "B09FSTTJKS", - "B09FXVS48N", "B09FY387CL", "B09FY51L6B", "B09FYXNNBF", "B09FZ6GQ15", - "B09FZFH7QV", "B09FZTYT8X", "B09G2D8WCM", "B09G2MX3Z4", "B09G2RF3PG", - "B09G6QSLZJ", "B09G6S72QV", "B09G71YGSV", "B09G75PX77", "B09G9LB74G", - "B09G9N6GKP", "B09GF87ZNG", "B09GG22167", "B09GK1DX1C", "B09GK5FN5C", - "B09GK91RLQ", "B09GK9SGBM", "B09GL53HWC", "B09GLJT3WZ", "B09GP72MFV", - "B09GPWST54", "B09GT58PZL", "B09GTM8CZP", "B09GVNWBF8", "B09GW74VT2", - "B09GXG8HXK", "B09GXPZLQH", "B09GXR7S6Q", "B09GXTNQW1", "B09GXWGV6H", - "B09GY2R4K9", "B09H2DL8SK", "B09H2GC8JJ", "B09H2QHLGY", "B09H3DZ1HP", - "B09H3KHGMT", "B09H3LWR2B", "B09H3P43Z5", "B09H3YDVJ9", "B09H5NK549", - "B09H5S27RQ", "B09H6F4MD7", "B09H6KFRV9", "B09H6VRJYY", "B09H72RWXJ", - "B09HBBJ95K", "B09HBGBDDL", "B09HBKP3DP", "B09HBMKTMV", "B09HBSTWK9", - "B09HBY52DP", "B09HC6FJHH", "B09HC7Q2V7", "B09HC7W6BH", "B09HGM5B1C", - "B09HGXQJ8G", "B09HGYVSJM", "B09HHLPN9T", "B09HJM8Z5T", "B09HJR6FP2", - "B09HJTC3YR", "B09HKJFT7Y", "B09HKKW6W8", "B09HKM463K", "B09HKPBCX3", - "B09HN1CY8H", "B09HN4NBDF", "B09HP3HJJ3", "B09HP3NLY7", "B09HPJSHQ1", - 
"B09HR5651S", "B09HR8253V", "B09HRJ7G74", "B09HRLD8LK", "B09HSNJBVH", - "B09HSSCQ7N", "B09HT7G7VJ", "B09HT872JP", "B09HT8LTDZ", "B09HT95S1S", - "B09HT99C2M", "B09HTZYS5X", "B09HWR2FHP", "B09HWVLWRW", "B09HX1YV12", - "B09HX5VVWB", "B09HX6S6HN", "B09HX7MVG1", "B09HX7Q84R", "B09HX945GX", - "B09HXC1RTV", "B09HXHQVSG", "B09HXJYXQG", "B09HY4FN3Q", "B09HYTCT9G", - "B09HYV5JSP", "B09HZ1ST61", "B09HZ8FPG2", "B09HZMPNC1", "B09J1N9BMW", - "B09J1ZVSYR", "B09J1ZWZX1", "B09J217SNW", "B09J218FDX", "B09J21FD2K", - "B09J21H9N9", "B09J24QDDP", "B09J2F3J7K", "B09J2LWNZX", "B09J4PY9C9", - "B09J4VCNY5", "B09J54N8JD", "B09J7KJ6W2", "B09J7RCXMC", "B09J85G48F", - "B09J8CXXF1", "B09J8DBLBY", "B09J8FXQKG", "B09J8H4QTK", "B09J8LTH94", - "B09J8SZMH4", "B09J96SQGH", "B09JB21P2R", "B09JB469QT", "B09JBDHXQN", - "B09JBL2GK5", "B09JBY9QSH", "B09JFQVBJ9", "B09JG5XS8Z", "B09JGFZSRR", - "B09JJQCLF3", "B09JJVG3SQ", "B09JK6BQ51", "B09JK8FW4H", "B09JL12CPV", - "B09JLDN7FB", "B09JLK9Y85", "B09JM1HFLS", "B09JMW9ZLT", "B09JPGZKYR", - "B09JS71VK6", "B09JSS6JYD", "B09JW1M5FS", "B09JW8ZBHH", "B09JWBG2VM", - "B09JWCDJMN", "B09JWJ4997", "B09JWJNTQ1", "B09JYNSJBX", "B09JYQMLQZ", - "B09JZBHRLP", "B09JZD4Q46", "B09JZGNY5H", "B09K44F4S8", "B09K4FFJ51", - "B09K6FKQFN", "B09K761P9F", "B09K7K5HPR", "B09KBN9PWN", "B09KG8VTW3", - "B09KGNFP6P", "B09KGZ68TR", "B09KH1K7LM", "B09KH1S88H", "B09KH8YJKL", - "B09KHCFG6Q", "B09KHKMFVL", "B09KKTJ83P", "B09KLMP553", "B09KLQ8XG2", - "B09KLS5P9Z", "B09KLSYQRL", "B09KLW9P47", "B09KMJ6XVY", "B09KMVHB9Q", - "B09KNBGN29", "B09KNC6126", "B09KNDLKN7", "B09KNGVQJN", "B09KNQDG5D", - "B09KPFLFMR", "B09KPYXNQC", "B09KRDF2PY", "B09KRJX2BB", "B09KRNZD48", - "B09KRZF4NQ", "B09KT5J4HK", "B09KT8YJ46", "B09KTRYQZP", "B09KTVDWG5", - "B09KV2Y8X4", "B09KV4T2FM", "B09KXRCBQL", "B09KYZGH9M", "B09KZN7GB1", - "B09L12YHK8", "B09L1CP58B", "B09L4SX5CQ", "B09L7JHPMY", "B09L7KCC3Y", - "B09L7KL3JR", "B09L7KTZXK", "B09L7KW8QM", "B09L7LQZCY", "B09L7QXNCR", - "B09L837ZRJ", "B09L83TVZF", "B09L85ZBZR", "B09LC16X7H", 
"B09LC4J9BC", - "B09LC9CLQD", "B09LCBPRXY", "B09LCCJ6NT", "B09LCJDCHK", "B09LCKP897", - "B09LCM2WG4", "B09LCNV4JW", "B09LCPNNDM", "B09LCQ21BC", "B09LCWSSMY", - "B09LGZG9LN", "B09LLFDV5Q", "B09LLPXVZY", "B09LLV7PSS", "B09LM1ZTM8", - "B09LM6GKMM", "B09LM6TD73", "B09LMJMFDG", "B09LQ9MKYQ", "B09LQC12Q6", - "B09LQG5WPX", "B09LQGY84X", "B09LQHNJJL", "B09LR24RXV", "B09LR26H5Z", - "B09LRRBCBZ", "B09LRRFX5N", "B09LRRQ7PY", "B09LRRRTYF", "B09LRRW25Y", - "B09LSWJF7L", "B09LTSJ9TC", "B09LTSS9QQ", "B09LV9G8VL", "B09LYH5SYK", - "B09LYTRXW1", "B09LYVF3C8", "B09LYW87WH", "B09M3PZLVZ", "B09M6V66CC", - "B09M73P45B", "B09M7C1YWB", "B09M7QFNTC", "B09M8963F5", "B09M89P2KH", - "B09M8G3QZ9", "B09M8HQ8GD", "B09M9HBRZJ", "B09M9WTVQX", "B09M9XJDSP", - "B09MD26QFB", "B09MF8YB2K", "B09MFKTG3C", "B09MFLB7VR", "B09MFMK4VX", - "B09MH9JTKB", "B09MHS5NY9", "B09MJ18V7P", "B09MJQVMBQ", "B09MJWY5SY", - "B09MJYSGZH", "B09MK7FNWS", "B09MKC5BRF", "B09MKCYDPB", "B09MKDGXXG", - "B09MKMMJ55", "B09MKQV3XB", "B09MKQZG5N", "B09MKVV4DV", "B09MLB4CNP", - "B09MLD9MRJ", "B09MLDGRQ1", "B09MLV4XF2", "B09MQ7PGZ3", "B09MQNLF8X", - "B09MQRPL77", "B09MQS99QJ", "B09MQVWH8Q", "B09MQY9TVN", "B09MR7PHN1", - "B09MRLK1M8", "B09MSXRTBS", "B09MT96W4Y", "B09MTCVQ8Y", "B09MTM3F72", - "B09MTMX7Q3", "B09MTPCN7L", "B09MTPTX1Q", "B09MVML5PX", "B09MVS9QWN", - "B09MYHB7BB", "B09MYQK6QH", "B09MYQZ3PW", "B09MZ5XNX3", "B09MZ8KYJR", - "B09MZ9BKJP", "B09MZ9TT3R", "B09N1D9LRC", "B09N1DF64V", "B09N1F6H2P", - "B09N1NCG5W", "B09N3CH82P", "B09N3DGR48", "B09N3KPPRF", "B09N3MR529", - "B09N3NWMZD", "B09N5HBSDD", "B09N6XPL23", "B09N6ZTQX9", "B09N749QL4", - "B09N79697S", "B09N8QWGP7", "B09N8TTWND", "B09N94LQMG", "B09NBB6Y2R", - "B09NBBMRJH", "B09NBGWSKM", "B09NBGY5Z3", "B09NBJ5X1W", "B09NBVK9JG", - "B09NCWK16N", "B09NCXSXLH", "B09NCXT4VD", "B09ND17LSX", "B09ND1M3WH", - "B09NDH2Z4V", "B09NDHNT4B", "B09NDHZF36", "B09NDJVG6J", "B09NFDZ6NS", - "B09NFSSG7Y", "B09NH1ZGFB", "B09NJS9RS2", "B09NKDD3HY", "B09NKFNG45", - "B09NLPMXP8", "B09NM62G51", 
"B09NM83C17", "B09NMD225N", "B09NMFXBKD", - "B09NMZWDP1", "B09NN1S77Q", "B09NN3Q2TM", "B09NN3WN2W", "B09NN4K7N3", - "B09NN5CMVH", "B09NNBPHGQ", "B09NNLF61S", "B09NPGDPZC", "B09NPHC8P6", - "B09NPLW3CM", "B09NPMT85W", "B09NQ2HW66", "B09NQ3WVZJ", "B09NQ5KHQ6", - "B09NQZ94VD", "B09NQZJWTS", "B09NQZPJVV", "B09NSDDNLV", "B09NSRCT17", - "B09NSXX182", "B09NVFX6N7", "B09NVG4NHF", "B09NVPPFDL", "B09NVT3Q7H", - "B09NVX6VRW", "B09NW45CX4", "B09NXRR9GR", "B09NXS6P5X", "B09NXT3HPT", - "B09NXT7YZM", "B09NXTR2YY", "B09NXTSJYT", "B09NXTXNCR", "B09NY3T7R8", - "B09NY4NJJ5", "B09NY5RBHC", "B09NYFJ1TF", "B09NZDG5HT", "B09NZX8THX", - "B09P15DFSZ", "B09P17L22Y", "B09P1FMM5R", "B09P1H7L7Z", "B09P1JQS7V", - "B09P1S8SRH", "B09P32YSBX", "B09P4RZ8RN", "B09P4ZGSR2", "B09P512BDK", - "B09P5HRJP6", "B09P5VCDL8", "B09P5VVPV7", "B09P61D6VD", "B09P6DP7WT", - "B09P7TH2ZW", "B09P7TXXG9", "B09P844C92", "B09P85WQ5X", "B09P87NZTN", - "B09P8B149Z", "B09P9YZWD6", "B09PB5GR3L", "B09PB5TV6G", "B09PBWXC2K", - "B09PDPG31D", "B09PFX9F5T", "B09PG9KQ24", "B09PJ74NFR", "B09PL77NP9", - "B09PL7D2YZ", "B09PMJVCZF", "B09PN3GYPD", "B09PND2Z9N", "B09PNDMJHZ", - "B09PNMNVN7", "B09PQ9YNRJ", "B09PQFT8KV", "B09PQM615Y", "B09PQQ3TFN", - "B09PQQ9D41", "B09PQVGZYT", "B09PR7L89L", "B09PRKB6JY", "B09PTJSWG7", - "B09PTKTSB8", "B09PTPYZM7", "B09PTRBVJ9", "B09PTYB6BV", "B09PV6CJJB", - "B09PY71FMR", "B09PY85J4X", "B09PYFXBZ5", "B09PYN1G1P", "B09PYQSKC7", - "B09PYVX7P6", "B09PZ7BGTN", "B09PZB9GJV", "B09Q11K4TR", "B09Q1Q3FWD", - "B09Q2R4L5H", "B09Q2VK1L1", "B09Q3GYP3V", "B09Q3JFQV3", "B09Q5M2DSR", - "B09Q6D967X", "B09QBPF3TR", "B09QBSY8Z6", "B09QBXLC1M", "B09QBXW33C", - "B09QCNXTHL", "B09QFXK7XG", "B09QHP2YYP", "B09QHPKXCV", "B09QHQ3TF4", - "B09QHQX9L9", "B09QHTJLNL", "B09QJTBFJ2", "B09QK17DSV", "B09QKNCZGP", - "B09QKQSS4P", "B09QKRP818", "B09QKS4XB2", "B09QKW4Y1J", "B09QKZ41KB", - "B09QLV2WSS", "B09QM9TNS8", "B09QMFD8VX", "B09QPQ79J6", "B09QQD1M6W", - "B09QQG26L8", "B09QSL2ZZ6", "B09QXF864N", "B09QYKRVR8", "B09QYPBHK5", - 
"B09R25MDB6", "B09R5184ZB", "B09R7GKTQ2", "B09R8YPSD9", "B09R94254P", - "B09R9WLR4L", "B09RHK53JB", "B09RJ7V7BK", "B09RTDDSKD", "B09RTGR9GS", - "B09RVVNCB9", "B09S5QLQHF", "B09S5XRTB4", "B09S611N6J", "B09S9L9SJZ", - "B09S9YFWQN", "B09SB2Q7JM", "B09SCQFLBS", "B09SD7SCHF", "B09SDB7M9D", - "B09SDBRS8T", "B09SDV6WHM", "B09SDVQYLX", "B09SDW5RQP", "B09SGGRYP6", - "B09SHCXCQJ", "B09SLR2VL5", "B09SP9WGBB", "B09SPH833Z", "B09SPTMT7Y", - "B09SSS6BVC", "B09STDR2Q7", "B09SW42QBL", "B09SWNMWXH", "B09SXPDQ4P", - "B09SY7N3X7", "B09SZCXM2L", "B09T6V1M4Q" - ] + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "response_status": 302, + "response_cookies": {"mage-messages": "^.*chair.* has been added to your wish list.*$"}, + "post_data": {"qty": null} } } ], @@ -14539,58 +13413,22 @@ "intent": "Add a white desk to my wish list.", "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "white desk"}, - "format_specification": null, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": { - "skus": [ - "B003B3NR62", "B004773CKW", "B00846JN00", "B00881ECVG", "B00EUT6LX2", - "B00FHXI13I", "B00FOOG5RY", "B00GMPDA9K", "B00NQHH5XO", "B0192REQ8I", - "B01MQFOWCJ", "B01N8SKYHP", "B06XPTGYL7", "B071KF55KQ", "B074H7JDCS", - "B076B6P1W8", "B0774FK6B7", "B077TK98RF", "B079313Z19", "B07CMJWM3Y", - "B07CPQPXGS", "B07DG6PY4P", "B07FR7KNRX", "B07H56P8LM", "B07HK436BZ", - "B07JFRM23P", "B07KGLPCRQ", "B07KT4NRFK", "B07MCCQ2HV", "B07MV1FZFB", - "B07NBXGXZN", "B07PB6179B", "B07PJG6YGY", 
"B07PMRWR8W", "B07R7KVGQG", - "B07RKLTDQS", "B07S6D3NPM", "B07T1FS6X2", "B07V8SDLBQ", "B07W1GGR6X", - "B07W5DZLZK", "B07XKS5X6G", "B07Y38GFGT", "B07YKSG7XF", "B07YWTSPQ3", - "B07ZJH1Z3X", "B07ZX3PBJ1", "B0812Z74DR", "B0813XDJFZ", "B08243WG1C", - "B0829XTKK1", "B082Q1LC55", "B082XXDVC4", "B082YVMJYS", "B083NX39PD", - "B083WM9PLP", "B084JMJ7B2", "B0868HY623", "B086WN6DC2", "B086Z8JGW6", - "B0876FCLTY", "B08DFJZCV8", "B08GPH26G3", "B08GWWZ3QP", "B08GYG3M6F", - "B08HCS3W8J", "B08HKZSN2M", "B08HN9K15V", "B08HVNQ5M2", "B08J1D4LLB", - "B08J4334DD", "B08JHCP28Q", "B08KTB1M8Q", "B08L4K5R93", "B08M5GWW3W", - "B08MKVKT5Q", "B08MTNPRX9", "B08N41HK9V", "B08ND9WC77", "B08NGDMJXN", - "B08NTWRS9X", "B08PB1TJ59", "B08PZ6QYLG", "B08QCGV9NS", "B08QF7FMG6", - "B08QF8XD1W", "B08QJFMBYZ", "B08RDGM6FD", "B08RHWN4L5", "B08S3TWCJ6", - "B08SDZS9QW", "B08SW9NN1P", "B08T64F9DB", "B08TRPQ4HQ", "B08TTGPZBM", - "B08VRFGD3K", "B08XGJHZQC", "B08Y1YSZT4", "B08YJLKR5R", "B08ZDRDMZY", - "B08ZNCSSF1", "B08ZXWR255", "B0914V6DY8", "B0927FPD86", "B092D256Q1", - "B092M5LW83", "B092M6GPJY", "B093225WFY", "B093K9SGX6", "B093KTVYF6", - "B093T9JT18", "B094CWP4M3", "B094NDDCSG", "B094QKZMY8", "B094QL8C87", - "B094VG34CW", "B094XPL4V8", "B096JZQLS5", "B096VQVV17", "B0972GMJ5N", - "B097N8ZCRR", "B0987G7T84", "B098L2JVY7", "B098LLNQGH", "B098Q9N5ZV", - "B098RVVNN9", "B098XJRTBH", "B099KRGV13", "B099PCGKXZ", "B099RPWD9M", - "B099WF1B5R", "B099WVV4BG", "B099ZCD7FS", "B09BVP53ZX", "B09BW1L6CH", - "B09C225J31", "B09CZ4776Q", "B09D3N1FJK", "B09DG76B6M", "B09DSHCMQX", - "B09DSRQGHP", "B09F37XFG5", "B09F628ZJQ", "B09F6KLB84", "B09FLP98S2", - "B09FTJQKVM", "B09FXGF9J5", "B09GK3H2CW", "B09HGVCTW5", "B09HN7XYR9", - "B09HN87JYR", "B09HTZFPLY", "B09J46P6ZG", "B09J53SGDV", "B09J8FDSBH", - "B09KN9KPWV", "B09KSYWTK1", "B09KTZ3K7P", "B09L5QVNPP", "B09L5ZPQBL", - "B09LC9DPFZ", "B09LHZK55X", "B09LYNXBZ9", "B09MCLF3FC", "B09MFX8S3W", - "B09MH9JTKB", "B09MQFZT7G", "B09MW76LJP", "B09MW7LTTK", "B09N8M8F3M", - "B09NBRVMXM", 
"B09NKNVMQX", "B09NQWPHW2", "B09NY4NJJ5", "B09P1BY2ZK", - "B09P3MT8BW", "B09P556Z72", "B09P8QDR8R", "B09PBJX6QJ", "B09PBTKW3V", - "B09PDRWG7K", "B09PDSLHG9", "B09PDTY47P", "B09PFRHMCK", "B09PFTFZT5", - "B09PFVQCWJ", "B09PFW3WBJ", "B09PSMZ213", "B09PTL9FYT", "B09PYWDGKZ", - "B09S8RSPTY", "B09SY4YD23" - ] + "eval": [ + { + "evaluator": "AgentResponseEvaluator", + "results_schema": {"type": "null"}, + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "response_status": 302, + "response_cookies": { + "mage-messages": "^.*(?=.*white)(?=.*desk).* has been added to your wish list.*$" + }, + "post_data": {"qty": null} } } ], @@ -14604,37 +13442,22 @@ "intent": "Add a white computer desk to my wish list.", "intent_template": "Add a {{product}} to my wish list.", "instantiation_dict": {"product": "white computer desk"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", + "evaluator": "NetworkEventEvaluator", "expected": { - "skus": [ - "B003B3NR62", "B00EUT6LX2", "B00FHXI13I", "B00GMPDA9K", "B01MQFOWCJ", - "B01N8SKYHP", "B071KF55KQ", "B0774FK6B7", "B07CPQPXGS", "B07FR7KNRX", - "B07PB6179B", "B07PMRWR8W", "B07R7KVGQG", "B07RKLTDQS", "B07W1GGR6X", - "B07W5DZLZK", "B07XKS5X6G", "B07Y38GFGT", "B07YWTSPQ3", "B0813XDJFZ", - "B08243WG1C", "B0829XTKK1", "B082XXDVC4", "B083NX39PD", "B0868HY623", - "B08GPH26G3", "B08GYG3M6F", "B08HCS3W8J", "B08HN9K15V", "B08HVNQ5M2", - "B08J1D4LLB", "B08J4334DD", "B08L4K5R93", "B08N41HK9V", "B08QF8XD1W", - "B08QJFMBYZ", "B08RHWN4L5", "B08S3TWCJ6", "B08T64F9DB", "B08TRPQ4HQ", - 
"B08TTGPZBM", "B08VRFGD3K", "B08ZDRDMZY", "B08ZXWR255", "B0927FPD86", - "B093225WFY", "B093K9SGX6", "B093KTVYF6", "B093T9JT18", "B094QKZMY8", - "B097N8ZCRR", "B098LLNQGH", "B098Q9N5ZV", "B098RVVNN9", "B099KRGV13", - "B099PCGKXZ", "B099WF1B5R", "B099ZCD7FS", "B09BW1L6CH", "B09DG76B6M", - "B09DSHCMQX", "B09F628ZJQ", "B09FTJQKVM", "B09HN7XYR9", "B09HN87JYR", - "B09HTZFPLY", "B09J46P6ZG", "B09KN9KPWV", "B09KSYWTK1", "B09L5ZPQBL", - "B09LHZK55X", "B09LYNXBZ9", "B09MH9JTKB", "B09MW76LJP", "B09N8M8F3M", - "B09NY4NJJ5", "B09P1BY2ZK", "B09P8QDR8R", "B09PBTKW3V", "B09PDRWG7K", - "B09PDSLHG9", "B09PDTY47P", "B09PFTFZT5", "B09PFVQCWJ", "B09PFW3WBJ", - "B09PSMZ213", "B094R8LKJZ" - ] + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "response_status": 302, + "response_cookies": { + "mage-messages": "^.*(?=.*white)(?=.*computer)(?=.*desk).* has been added to your wish list.*$" + }, + "post_data": {"qty": null} } } ], @@ -14650,17 +13473,20 @@ "intent": "Add the product on the current page to my wishlist", "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B0040WHKIY"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "104497", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -14675,17 +13501,20 @@ "intent": "Add the product on the current page to my wishlist", "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B01MTYJG38"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "37811", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -14698,17 +13527,20 @@ "intent": "Add the product on the current page to my wishlist", "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "shopping", - "expected": {"sku": "B01MY87FWG"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING__/wishlist/index/add/", + "http_method": "POST", + "post_data": {"product": "37285", "qty": null}, + "response_status": 302 + } } ], "revision": 2 @@ -14723,17 +13555,14 @@ "intent": "Add the product on the current page to my wishlist", "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": 
"__SHOPPING__/wishlist/index/add/", "http_method": "POST", @@ -14754,17 +13583,14 @@ "intent": "Add the product on the current page to my wishlist", "intent_template": "Add the product on the current page to my wishlist", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/wishlist/index/add/", "http_method": "POST", @@ -14783,17 +13609,14 @@ "intent": "Subscribe to the newsletter of OneStopMarket", "intent_template": "Subscribe to the newsletter of OneStopMarket", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/newsletter/subscriber/new/", "http_method": "POST", @@ -14812,12 +13635,11 @@ "intent": "Fork all repos from facebook.", "intent_template": "Fork {{repo}}.", "instantiation_dict": {"repo": "all repos from facebook"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -14844,19 +13666,17 @@ "intent": "Star the top five most stared repos in Gitlab", "intent_template": "Star the top {{number}} 
most stared repos in Gitlab", "instantiation_dict": {"number": "five"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 56} } }, @@ -14865,7 +13685,6 @@ "expected": { "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 53} } }, @@ -14874,7 +13693,6 @@ "expected": { "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 52} } }, @@ -14883,7 +13701,6 @@ "expected": { "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 48} } }, @@ -14892,7 +13709,6 @@ "expected": { "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 47} } } @@ -14907,19 +13723,17 @@ "intent": "Star the top eight most stared repos in Gitlab", "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "eight"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", "http_method": "POST", - 
"response_status": 200, "response_content": {"star_count": 56} } }, @@ -14928,7 +13742,6 @@ "expected": { "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 53} } }, @@ -14937,7 +13750,6 @@ "expected": { "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 52} } }, @@ -14946,7 +13758,6 @@ "expected": { "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 48} } }, @@ -14955,7 +13766,6 @@ "expected": { "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 47} } }, @@ -14964,7 +13774,6 @@ "expected": { "url": "__GITLAB__/eriklindernoren/PyTorch-GAN/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 46} } }, @@ -14973,7 +13782,6 @@ "expected": { "url": "__GITLAB__/thoughtbot/administrate/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 45} } }, @@ -14982,7 +13790,6 @@ "expected": { "url": "__GITLAB__/keycloak/keycloak/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 44} } } @@ -14997,19 +13804,17 @@ "intent": "Star the top four most stared repos in Gitlab", "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "four"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": 
"__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 56} } }, @@ -15018,7 +13823,6 @@ "expected": { "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 53} } }, @@ -15027,7 +13831,6 @@ "expected": { "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 52} } }, @@ -15036,7 +13839,6 @@ "expected": { "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 48} } } @@ -15051,19 +13853,17 @@ "intent": "Star the top three most stared repos in Gitlab", "intent_template": "Star the top {{number}} most stared repos in Gitlab", "instantiation_dict": {"number": "three"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 56} } }, @@ -15072,7 +13872,6 @@ "expected": { "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 53} } }, @@ -15081,7 +13880,6 @@ "expected": { "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 52} } } @@ -15096,19 +13894,17 @@ "intent": "Star the top one most stared repos in Gitlab", "intent_template": "Star the top {{number}} most stared repos in Gitlab", 
"instantiation_dict": {"number": "one"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", "http_method": "POST", - "response_status": 200, "response_content": {"star_count": 56} } } @@ -15120,15 +13916,14 @@ "task_id": 528, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent": "Fill out the contact us form with this refund message for the phone screen protector bought March 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", + "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", "instantiation_dict": {"product": "phone screen protector", "time": "March 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -15157,15 +13952,14 @@ "task_id": 529, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the Stereo Sound System I bought Feb 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent": "Fill out the contact us form with this refund message for the Stereo Sound System bought Feb 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", + "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", "instantiation_dict": {"product": "Stereo Sound System", "time": "Feb 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -15194,15 +13988,14 @@ "task_id": 530, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent": "Fill out the contact us form with this refund message for the kitchen organizer bought around Feb 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", + "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", "instantiation_dict": {"product": "kitchen organizer", "time": "around Feb 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -15231,21 +14024,18 @@ "task_id": 531, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent": "Fill out the contact us form with this refund message for the phone case bought March 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", + "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", "instantiation_dict": {"product": "phone case", "time": "March 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -15255,21 +14045,18 @@ "task_id": 532, "intent_template_id": 154, "start_urls": ["__SHOPPING__"], - "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", - "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. The shop requires the order id, the reason and the amount to refund in the message, so use the following message format: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\". Don\"t submit yet", + "intent": "Fill out the contact us form with this refund message for the PS3 remote controller bought early 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", + "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", "instantiation_dict": {"product": "PS3 remote controller", "time": "early 2023"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -15281,20 +14068,28 @@ "start_urls": ["__GITLAB__"], "intent": "Follow [\"convexegg\", \"yjlou\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { "account_list": ["convexegg", "yjlou"] }, - "format_specification": null, + "instantiation_dict": {"account_list": "[\"convexegg\", \"yjlou\"]"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "following_usernames": ["convexegg", "yjlou"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/convexegg/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/yjlou/follow.json", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -15306,20 +14101,36 @@ "start_urls": ["__GITLAB__"], "intent": "Follow [\"Jakub Klinkovsk\u00fd\", \"Koushik\", \"Vinta Chen\"] on Gitlab", "intent_template": "Follow 
{{account_list}} on Gitlab", - "instantiation_dict": { "account_list": ["Jakub Klinkovsk\u00fd", "Koushik", "Vinta Chen"] }, - "format_specification": null, + "instantiation_dict": {"account_list": "[\"Jakub Klinkovsk\u00fd\", \"Koushik\", \"Vinta Chen\"]"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/lahwaacz/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/koush/follow.json", + "http_method": "POST", + "response_status": 302 + } }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "following_usernames": ["lahwaacz", "koush", "vinta"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/vinta/follow.json", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -15331,20 +14142,36 @@ "start_urls": ["__GITLAB__"], "intent": "Follow [\"Jakub K\", \"ghost\", \"Beno\u00eet Blanchon\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { "account_list": ["Jakub K", "ghost", "Beno\u00eet Blanchon"] }, - "format_specification": null, + "instantiation_dict": {"account_list": "[\"Jakub K\", \"ghost\", \"Beno\u00eet Blanchon\"]"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/lahwaacz/follow.json", + 
"http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/ghost/follow.json", + "http_method": "POST", + "response_status": 302 + } }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "following_usernames": ["lahwaacz", "ghost", "bblanchon"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/bblanchon/follow.json", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -15356,20 +14183,36 @@ "start_urls": ["__GITLAB__"], "intent": "Follow [\"ghost\", \"R1kk3r\", \"Abishek\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { "account_list": ["ghost", "R1kk3r", "Abishek"] }, - "format_specification": null, + "instantiation_dict": {"account_list": "[\"ghost\", \"R1kk3r\", \"Abishek\"]"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "following_usernames": ["ghost", "R1kk3r", "abisubramanya27"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/ghost/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/R1kk3r/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/abisubramanya27/follow.json", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -15382,21 +14225,53 @@ "intent": "Follow [\"Jakub Klinkovsk\", \"convexegg\", \"Vinta Chen\", 
\"yjlou\", \"Abishek S\"] on Gitlab", "intent_template": "Follow {{account_list}} on Gitlab", "instantiation_dict": { - "account_list": ["Jakub Klinkovsk", "convexegg", "Vinta Chen", "yjlou", "Abishek S"] + "account_list": "[\"Jakub Klinkovsk\", \"convexegg\", \"Vinta Chen\", \"yjlou\", \"Abishek S\"]" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/lahwaacz/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/convexegg/follow.json", + "http_method": "POST", + "response_status": 302 + } }, { - "evaluator": "BackendStateEvaluator", - "username": "byteblaze", - "following_usernames": ["lahwaacz", "convexegg", "vinta", "yjlou", "abisubramanya27"], - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/vinta/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/yjlou/follow.json", + "http_method": "POST", + "response_status": 302 + } + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/users/abisubramanya27/follow.json", + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -15412,24 +14287,28 @@ "order_id": "299", "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": 
{"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "299", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], "expected": { - "address": "456 Oak Avenue", - "address2": "Apartment 5B", - "city": "New York", - "state": "New York", - "zip_code": "10001", - "country": "US" + "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/598/", + "http_method": "POST", + "post_data": { + "street[0]": "456 Oak Avenue", + "street[1]": "Apartment 5B", + "country_id": "US", + "region": "New York", + "region_id": "43", + "city": "New York", + "postcode": "10001" + }, + "response_status": 302 } } ], @@ -15443,22 +14322,27 @@ "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "65", "address": "789 Pine Lane, San Francisco, CA, 94102"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "65", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], "expected": { - "address": "789 Pine Lane", - "city": "San Francisco", - "state": "California", - "zip_code": "94102" + "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/130/", + "http_method": "POST", + "post_data": { + "street[0]": "789 Pine Lane", + "country_id": "US", + "region": "California", + "region_id": "12", + "city": "San Francisco", + "postcode": "94102" + }, + "response_status": 302 } } ], @@ -15475,23 +14359,28 @@ "order_id": "301", "address": "321 
Birch Boulevard, Suite 200, Dallas, TX, 75201" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "301", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], "expected": { - "address": "321 Birch Boulevard", - "address2": "Suite 200", - "city": "Dallas", - "state": "Texas", - "zip_code": "75201" + "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/602/", + "http_method": "POST", + "post_data": { + "street[0]": "321 Birch Boulevard", + "street[1]": "Suite 200", + "country_id": "US", + "region": "Texas", + "region_id": "57", + "city": "Dallas", + "postcode": "75201" + }, + "response_status": 302 } } ], @@ -15505,12 +14394,11 @@ "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "125", "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -15518,8 +14406,6 @@ "expected": { "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/249/", "http_method": "POST", - "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/address/address_id/249/"}, - "method": "POST", "post_data": { "street[0]": "654 Elm Drive", "street[1]": "Apartment 12", @@ -15529,8 +14415,7 @@ "city": "Miami", "postcode": 
"33101" }, - "response_status": 302, - "redirect_url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125/" + "response_status": 302 } } ], @@ -15544,22 +14429,27 @@ "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", "intent_template": "Modify the address of order #{{order_id}} to {{address}}", "instantiation_dict": {"order_id": "300", "address": "987 Cedar Court, Los Angeles, CA, 90012"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "order_id": "300", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_post_data_params_patterns": ["^form_key$"], "expected": { - "address": "987 Cedar Court", - "city": "Los Angeles", - "state": "California", - "zip_code": "90012" + "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/600/", + "http_method": "POST", + "post_data": { + "street[0]": "987 Cedar Court", + "country_id": "US", + "region": "California", + "region_id": "12", + "city": "Los Angeles", + "postcode": "90012" + }, + "response_status": 302 } } ], @@ -15573,19 +14463,16 @@ "intent": "Update the product description of Bella Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Bella Tank"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, 
- "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1684/type/configurable/store/0/set/9/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1684/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": {"product[short_description]": "

2 customer(s) love it!

"} @@ -15602,19 +14489,16 @@ "intent": "Update the product description of Selene Yoga Hoodie to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Selene Yoga Hoodie"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/9/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": {"product[short_description]": "

3 customer(s) love it!

"} @@ -15631,19 +14515,16 @@ "intent": "Update the product description of Radiant Tee to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Radiant Tee"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1556/type/configurable/store/0/set/9/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1556/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": {"product[short_description]": "

1 customer(s) love it!

"} @@ -15660,19 +14541,16 @@ "intent": "Update the product description of Lucia Cross-Fit Bra to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", "instantiation_dict": {"product": "Lucia Cross-Fit Bra"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1668/type/configurable/store/0/set/9/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1668/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": {"product[short_description]": "

don't miss out on this amazing product

"} @@ -15687,25 +14565,46 @@ "intent_template_id": 252, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", - "intent_template": "Add a new {{option}} option {{value}} to the {{base_setting}} of {{product}}", + "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "color", - "value": "brown", - "base_setting": "size S", + "option_spec": "a new color option brown", + "base_setting": "the size S of", "product": "Phoebe Zipper Sweatshirt" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1130", - "site": "shopping_admin", - "expected": { "variants": ["Phoebe Zipper Sweatshirt-S-Brown"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/93", + "http_method": "POST", + "response_status": 302, + "post_data": { + "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=Brown.*$" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } + } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1130/type/configurable/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Phoebe Zipper Sweatshirt", + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: s, color: brown"] + } + } } ], "revision": 2 @@ -15716,25 +14615,46 @@ "intent_template_id": 252, "start_urls": 
["__SHOPPING_ADMIN__"], "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", - "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}", + "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "color", - "value": "blue", - "base_setting": "size S and M", + "option_spec": "a new color blue", + "base_setting": "size S and M of", "product": "Frankie Sweatshirt" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} + }, + { + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/93", + "http_method": "POST", + "response_status": 302, + "post_data": { + "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=Blue.*$" + } + } }, { - "evaluator": "BackendStateEvaluator", - "product_id": "110", - "site": "shopping_admin", - "expected": { "variants": ["Frankie Sweatshirt-M-Blue", "Frankie Sweatshirt-S-Blue"] } + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } + } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/110/type/configurable/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Frankie Sweatshirt", + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: s, color: blue", "size: m, color: blue"] + } + } } ], "revision": 2 @@ -15744,26 +14664,47 @@ "task_id": 549, "intent_template_id": 252, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a new 
size XXXL to green Minerva LumaTech\u2122 V-Tee", - "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}", + "intent": "Add a new size XXXL to green Minerva LumaTech V-Tee", + "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "size", - "value": "XXXL", + "option_spec": "a new size XXXL", "base_setting": "green", "product": "Minerva LumaTech V-Tee" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1492", - "site": "shopping_admin", - "expected": { "variants": ["Minerva LumaTech\u2122 V-Tee-XXXL-Green"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/144", + "http_method": "POST", + "response_status": 302, + "post_data": { + "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=XXXL.*$" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } + } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1492/type/configurable/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Minerva LumaTech™ V-Tee", + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: xxxl, color: green"] + } + } } ], "revision": 2 @@ -15774,25 +14715,46 @@ "intent_template_id": 252, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", - "intent_template": "Add 
a new {{option}} {{value}} to {{base_setting}} {{product}}", + "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "size", - "value": "XXS", + "option_spec": "a new size XXS", "base_setting": "blue and purple", "product": "Nona Fitness Tank" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1732", - "site": "shopping_admin", - "expected": { "variants": ["Nona Fitness Tank-XXS-Blue", "Nona Fitness Tank-XXS-Purple"] } + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/144", + "http_method": "POST", + "response_status": 302, + "post_data": { + "serialized_options": "^.*&swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=XXS.*$" + } + } + }, + { + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } + } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1732/type/configurable/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Nona Fitness Tank", + "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: xxs, color: blue", "size: xxs, color: purple"] + } + } } ], "revision": 2 @@ -15803,25 +14765,22 @@ "intent_template_id": 252, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Add new size 30 and 31 to all color variants of Diana Tights", - "intent_template": "Add new {{option}} {{value}} to {{base_setting}} of {{product}}", + "intent_template": "Add 
{{option_spec}} to {{base_setting}} {{product}}", "instantiation_dict": { - "option": "size", - "value": "30 and 31", - "base_setting": "all color variants", + "option_spec": "new size 30 and 31", + "base_setting": "all color variants of", "product": "Diana Tights" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/save/id/1854/type/configurable/store/0/set/10/back/edit", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1854/type/configurable/store/0/set/\\d+/back/edit$", "http_method": "POST", "response_status": 302, "post_data": { @@ -15858,21 +14817,14 @@ "intent": "Use the Web IDE to create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the space forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", "instantiation_dict": {"directory": "real_space", "gitlab_repo": "gimmiethat.space", "forum": "space"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/space/new"} - }, - { - "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "post_data_schema": { "type": "object", "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } @@ -15903,23 +14855,16 @@ "instantiation_dict": { "directory": "news", "gitlab_repo": "gimmiethat.space", - "forum": "news related subreddits" + "forum": "news related forums" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/news/new"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "post_data_schema": { "type": "object", "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } @@ -15952,21 +14897,14 @@ "gitlab_repo": "gimmiethat.space", "forum": "movies" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": 
{"url": "__REDDIT__/f/movies/new"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "post_data_schema": { "type": "object", "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } @@ -15995,21 +14933,14 @@ "intent": "Use the Web IDE to create a folder named funny_pic in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the memes forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", "instantiation_dict": {"directory": "funny_pic", "gitlab_repo": "gimmiethat.space", "forum": "memes"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/memes/new"} - }, - { - "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "post_data_schema": { "type": "object", "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } @@ -16035,22 +14966,17 @@ "task_id": 556, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_honest_fans with a README file containing only Christopher Nolan's theatrically released feature-length films (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent": "Create a repository named nolan_honest_fans with a README file containing only Christopher Nolan's theatrically released feature-length films (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. 
Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", "instantiation_dict": { "name": "nolan_honest_fans", "topics": "only Christopher Nolan's theatrically released feature-length films" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16085,29 +15011,24 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "wikipedia"], "task_id": 557, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_old_fans with a README file containing only Christopher Nolan's theatrically released feature-length films before 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", + "intent": "Create a repository named nolan_old_fans with a README file containing only Christopher Nolan's theatrically released feature-length films before 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", "instantiation_dict": { "name": "nolan_old_fans", "topics": "only Christopher Nolan's theatrically released feature-length films before 2010" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16142,29 +15063,24 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "wikipedia"], "task_id": 558, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_young_fans with a README file containing only Christopher Nolan's theatrically released feature-length films after 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent": "Create a repository named nolan_young_fans with a README file containing only Christopher Nolan's theatrically released feature-length films after 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. 
Commit to the default branch.", "instantiation_dict": { "name": "nolan_young_fans", "topics": "only Christopher Nolan's theatrically released feature-length films after 2010" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16199,29 +15115,24 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "wikipedia"], "task_id": 559, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_followers with a README file containing career timeline headings of Christopher Nolan in order (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent": "Create a repository named nolan_followers with a README file containing career timeline headings of Christopher Nolan in order (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. 
Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", "instantiation_dict": { "name": "nolan_followers", "topics": "career timeline headings of Christopher Nolan in order" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16256,29 +15167,24 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "wikipedia"], "task_id": 560, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_academy_awards with a README file containing movies that won Academy Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", + "intent": "Create a repository named nolan_academy_awards with a README file containing movies that won Academy Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", "instantiation_dict": { "name": "nolan_academy_awards", "topics": "movies that won Academy Awards by Christopher Nolan" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16313,29 +15219,24 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "wikipedia"], "task_id": 561, "intent_template_id": 87, "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named bafta_awards_nolan with a README file containing movies that are nominated BAFTA Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. 
Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the items. Commit to the default branch.", + "intent": "Create a repository named bafta_awards_nolan with a README file containing movies that are nominated BAFTA Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", + "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", "instantiation_dict": { "name": "bafta_awards_nolan", "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__WIKIPEDIA__/.*Christopher.*Nolan.*$"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16370,7 +15271,7 @@ } } ], - "revision": 2 + "revision": 3 }, { "sites": ["gitlab", "reddit"], @@ -16380,17 +15281,11 @@ "intent": "Create a private repository named Awesome_DIY_ideas with a README file containing links to the 6 most active DIY posts from the DIY forum. Use the Web IDE to create the README. 
Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "instantiation_dict": {"name": "Awesome_DIY_ideas", "num": 6}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/diy/active"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16437,17 +15332,11 @@ "intent": "Create a private repository named fun_thing_to_do with a README file containing links to the 5 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. 
Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "instantiation_dict": {"name": "fun_thing_to_do", "num": 5}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/diy/active"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16494,17 +15383,11 @@ "intent": "Create a private repository named live_a_life with a README file containing links to the 3 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "live_a_life", "num": 3}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/diy/active"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16551,17 +15434,11 @@ "intent": "Create a private repository named TODO with a README file containing links to the 10 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "TODO", "num": 10}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/diy/active"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16608,17 +15485,11 @@ "intent": "Create a private repository named Do it myself with a README file containing links to the 8 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", "instantiation_dict": {"name": "Do it myself", "num": 8}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/f/diy/active"} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16668,12 +15539,11 @@ "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon", "repo": "gimmiethat.space" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16704,12 +15574,11 @@ "intent": "Invite Abishek and Vinta as collaborator(s) to a11yproject.com repo", "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", "instantiation_dict": {"collaborator_account_list": "Abishek and Vinta", "repo": "a11yproject.com"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16743,12 +15612,11 @@ "collaborator_account_list": "Beno\u00eet and Abishek", "repo": "my HTML5 markup extention" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", 
"retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16782,12 +15650,11 @@ "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon", "repo": "my time tracking tool project" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -16836,17 +15703,14 @@ "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "231 Willow Way, Suite 100, Chicago, IL, 60601"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/customer/address/formPost/id/26/", "http_method": "POST", @@ -16873,17 +15737,14 @@ "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "654 Aspen Road, House #3, Boston, MA, 02110"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/customer/address/formPost/id/26/", "http_method": "POST", @@ -16910,17 +15771,14 @@ "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly", "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "987 Sycamore Circle, Philadelphia, PA, 19102"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/customer/address/formPost/id/26/", "http_method": "POST", @@ -16946,17 +15804,14 @@ "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "111 Magnolia Path, Atlanta, GA, 30303"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": 
"__SHOPPING__/customer/address/formPost/id/26/", "http_method": "POST", @@ -16982,17 +15837,14 @@ "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", "instantiation_dict": {"address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/customer/address/formPost/id/26/", "http_method": "POST", @@ -17017,18 +15869,18 @@ "intent_template_id": 351, "start_urls": ["__GITLAB__"], "intent": "Add the following users to repo a11y-webring.club as developer: \"abisubramanya27\" and \"lahwaacz\"", - "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "a11y-webring.club", "role": "developer", - "user_list": "\"abisubramanya27\" and \"lahwaacz\"" + "user_list": "\"abisubramanya27\" and \"lahwaacz\"", + "repo_prefix": "repo" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17052,18 +15904,18 @@ "intent_template_id": 351, "start_urls": ["__GITLAB__"], "intent": "Add the following users to my 
GitHub timeline item management extension as maintainer: \"abisubramanya27\" and \"lahwaacz\"", - "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}", + "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "GitHub timeline item management extension", "role": "maintainer", - "user_list": "\"abisubramanya27\" and \"lahwaacz\"" + "user_list": "\"abisubramanya27\" and \"lahwaacz\"", + "repo_prefix": "my" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17087,18 +15939,18 @@ "intent_template_id": 351, "start_urls": ["__GITLAB__"], "intent": "Add the following users to repo millennials-to-snake-people as reporter: \"yjlou\" and \"a11yproject\"", - "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "millennials-to-snake-people", "role": "reporter", - "user_list": "\"yjlou\" and \"a11yproject\"" + "user_list": "\"yjlou\" and \"a11yproject\"", + "repo_prefix": "repo" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17118,14 +15970,18 @@ "intent_template_id": 351, "start_urls": ["__GITLAB__"], "intent": "Add the following users to my time tracking tool as guest: \"yjlou\"", - "intent_template": "Add the 
following users to my {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": {"repo": "time tracking tool", "role": "guest", "user_list": "\"yjlou\""}, - "format_specification": null, + "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", + "instantiation_dict": { + "repo": "time tracking tool", + "role": "guest", + "user_list": "\"yjlou\"", + "repo_prefix": "my" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17151,12 +16007,11 @@ "description": "A wild place for sci-fi enthusiasts", "sidebar_list": "**New**, Classic, *Movies*, Post my novel, Random" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17187,12 +16042,11 @@ "description": "Language Technologies Institute at Carnegie Mellon University", "sidebar_list": "**announcement**, paper, **alumni**" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17223,12 +16077,11 @@ "description": "Welcome to the future", "sidebar_list": "Games, *Books*, Movies, *Future*" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": 
"mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17259,12 +16112,11 @@ "description": "Cat parents & plan lovers", "sidebar_list": "**Cat friendly**, Local vendors, Promotion, *Toxic plants!*" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17295,12 +16147,11 @@ "description": "Place for Karaoke lovers", "sidebar_list": "*devices*, setup" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17333,17 +16184,14 @@ "summary": "Good purchase", "review": "I like it" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/review/product/post/id/73063/", "http_method": "POST", @@ -17373,17 +16221,14 @@ "summary": "Good purchase", "review": "I like it" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", 
"status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/review/product/post/id/102586/", "http_method": "POST", @@ -17413,17 +16258,14 @@ "summary": "Ok I guess", "review": "Does the job" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/review/product/post/id/101441/", "http_method": "POST", @@ -17453,17 +16295,14 @@ "summary": "Very bad", "review": "I hated it" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/review/product/post/id/14854/", "http_method": "POST", @@ -17493,17 +16332,14 @@ "summary": "Very bad", "review": "I hated it" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, "expected": { "url": "__SHOPPING__/review/product/post/id/76228/", "http_method": "POST", @@ -17529,15 +16365,14 @@ "instantiation_dict": { "title": "product launch", "event": "event of product 
launch", - "start_date": "1/16/2023", + "start_date": "January 16, 2023", "end_date": "on January 30, 2023" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17565,15 +16400,14 @@ "instantiation_dict": { "title": "code review", "event": "practice of collective code review", - "start_date": "1/16/2023", + "start_date": "January 16, 2023", "end_date": "in 20 days (inclusive)" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17583,7 +16417,7 @@ "post_data": { "milestone[title]": "code review", "milestone[start_date]": "2023-01-16", - "milestone[due_date]": "2023-02-05" + "milestone[due_date]": "2023-02-04" }, "response_status": 302 } @@ -17601,15 +16435,14 @@ "instantiation_dict": { "title": "sensitive information", "event": "task of cleaning sensitive information", - "start_date": "2/16/2023", + "start_date": "February 16, 2023", "end_date": "in 20 days (inclusive)" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17619,7 +16452,7 @@ "post_data": { "milestone[title]": "sensitive information", "milestone[start_date]": "2023-02-16", - "milestone[due_date]": "2023-03-08" + "milestone[due_date]": 
"2023-03-07" }, "response_status": 302 } @@ -17640,12 +16473,11 @@ "start_date": "March 15, 2044", "end_date": "on March 30, 2044" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17673,15 +16505,14 @@ "instantiation_dict": { "title": "zsh comprehensive support", "event": "task of adding a new branch for zsh comprehensive support", - "start_date": "5/1/2044", + "start_date": "May 1, 2044", "end_date": "in 20 days (inclusive)" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17691,7 +16522,7 @@ "post_data": { "milestone[title]": "zsh comprehensive support", "milestone[start_date]": "2044-05-01", - "milestone[due_date]": "2044-05-21" + "milestone[due_date]": "2044-05-20" }, "response_status": 302 } @@ -17704,15 +16535,14 @@ "task_id": 595, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the hottest post in the forum \"space\" and subscribe", - "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "intent": "Subscribe to the \"space\" forum from the page of the hottest post in that forum.", + "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", "instantiation_dict": {"post_selector": "hottest", "forum": "space"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17733,15 +16563,14 @@ "task_id": 596, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the all time top post in the forum \"books\" and subscribe", - "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "intent": "Subscribe to the \"books\" forum from the page of the all time top post in that forum.", + "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", "instantiation_dict": {"post_selector": "all time top", "forum": "books"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17762,15 +16591,14 @@ "task_id": 597, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the most controversial post in the forum \"consoles\" and subscribe", - "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "intent": "Subscribe to the \"consoles\" forum from the page of the most controversial post in that forum.", + "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", "instantiation_dict": {"post_selector": "most controversial", "forum": "consoles"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": 
null} }, { "evaluator": "NetworkEventEvaluator", @@ -17793,15 +16621,14 @@ "task_id": 598, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the all time most commented post in the forum \"pittsburgh\" and subscribe", - "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "intent": "Subscribe to the \"pittsburgh\" forum from the page of the all time most commented post in that forum.", + "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", "instantiation_dict": {"post_selector": "all time most commented", "forum": "pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17825,15 +16652,14 @@ "task_id": 599, "intent_template_id": 4, "start_urls": ["__REDDIT__"], - "intent": "Open the newest post in the forum \"machine learning\" and subscribe", - "intent_template": "Open the {{post_selector}} post in the forum \"{{forum}}\" and subscribe", + "intent": "Subscribe to the \"machine learning\" forum from the page of the newest post in that forum.", + "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", "instantiation_dict": {"post_selector": "newest", "forum": "machine learning"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17860,12 +16686,11 @@ "intent": "Post my question with the title 
\"what is the recommended console to buy these days?\", in a forum where I'm likely to get an answer", "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "what is the recommended console to buy these days?"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17890,12 +16715,11 @@ "intent": "Post my question with the title \"is car necessary in NYC\", in a forum where I'm likely to get an answer", "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "is car necessary in NYC"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17917,12 +16741,11 @@ "intent": "Post my question with the title \"places for new drivers to learn driving in pittsburgh\", in a forum where I'm likely to get an answer", "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "places for new drivers to learn driving in pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { 
"evaluator": "NetworkEventEvaluator", @@ -17947,12 +16770,11 @@ "intent": "Post my question with the title \"safe and budget apartment to live in nyc\", in a forum where I'm likely to get an answer", "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "safe and budget apartment to live in nyc"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -17977,12 +16799,11 @@ "intent": "Post my question with the title \"what is the SOTA web navigation agent repo\", in a forum where I'm likely to get an answer", "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", "instantiation_dict": {"question": "what is the SOTA web navigation agent repo"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18013,19 +16834,22 @@ "topic": "gaming consoles", "question": "what is the recommended console to buy these days" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "consoles", - 
"title": "what is the recommended console to buy these days" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/consoles"], + "http_method": "POST", + "post_data": { + "submission[title]": "what is the recommended console to buy these days", + "submission[forum]": "10039" + }, + "response_status": 302 } } ], @@ -18039,17 +16863,20 @@ "intent": "Find a forum focused on topics related to NYC, and post my question, \"is car necessary\" there", "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", "instantiation_dict": {"topic": "NYC", "question": "is car necessary"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "nyc", "title": "is car necessary"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], + "http_method": "POST", + "post_data": {"submission[title]": "is car necessary", "submission[forum]": "10067"}, + "response_status": 302 + } } ], "revision": 2 @@ -18065,17 +16892,23 @@ "topic": "city Pittsburgh", "question": "places for new drivers to learn driving" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "pittsburgh", "title": "places for new drivers to learn driving"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", 
"__REDDIT__/submit/pittsburgh"], + "http_method": "POST", + "post_data": { + "submission[title]": "places for new drivers to learn driving", + "submission[forum]": "10073" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18091,17 +16924,23 @@ "topic": "city lives in DMV area", "question": "safe and budget apartment to live" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"subreddit": "washington", "title": "safe and budge apartment to live"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": ["__REDDIT__/submit", "__REDDIT__/submit/washington"], + "http_method": "POST", + "post_data": { + "submission[title]": "safe and budget apartment to live", + "submission[forum]": "10030" + }, + "response_status": 302 + } } ], "revision": 2 @@ -18117,19 +16956,22 @@ "topic": "ML, DL, NLP", "question": "what is the SOTA web navigation agent repo" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "deeplearning", - "title": "what is the SOTA web navigation agent repo" + "url": ["__REDDIT__/submit", "__REDDIT__/submit/deeplearning"], + "http_method": "POST", + "post_data": { + "submission[title]": "what is the SOTA web navigation agent repo", + "submission[forum]": "10043" + }, + "response_status": 302 } } ], @@ -18143,12 +16985,11 @@ "intent": "Post a review of 
my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in f/books and then comment \"good book!\" on the created post", "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "To Kill a Mockingbird by Harper Lee", "content": "good book!"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18168,7 +17009,7 @@ "url": "^__REDDIT__/f/books/\\d+/-/comment$", "http_method": "POST", "headers": {"referer": "^__REDDIT__/f/books/\\d+/to-kill-a-mockingbird-by-harper-lee$"}, - "post_data": {"reply_to_submission_{{post_id}}[comment]": "good book!"}, + "post_data": {"$.^reply_to_submission_\\d+\\[comment\\]$": "good book!"}, "response_status": 302 } } @@ -18183,12 +17024,11 @@ "intent": "Post a review of my recent reading, titled \"Harry Potter\" in f/books and then comment \"Wonderful journey\" on the created post", "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Harry Potter", "content": "Wonderful journey"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18219,12 +17059,11 @@ "intent": "Post a review of my recent reading, titled \"big little lies\" in f/books and then comment \"can't stop it\" on the created post", "intent_template": "Post a review 
of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "big little lies", "content": "can't stop it"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18255,12 +17094,11 @@ "intent": "Post a review of my recent reading, titled \"Love story\" in f/books and then comment \"I cried\" on the created post", "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Love story", "content": "I cried"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18291,12 +17129,11 @@ "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in f/books and then comment \"It's a book with history\" on the created post", "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", "instantiation_dict": {"book": "Gone with the wind", "content": "It's a book with history"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18324,15 
+17161,14 @@ "task_id": 615, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Bald Eagle from this forum to earthporn forum using the image URL and title \"from /f/pics\"", + "intent": "Re-post the image of Bald Eagle from this forum to connecticut forum using the image URL and title \"from /f/pics\"", "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Bald Eagle", "forum": "earthporn"}, - "format_specification": null, + "instantiation_dict": {"content": "Bald Eagle", "forum": "connecticut"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18342,7 +17178,7 @@ "post_data": { "submission[url]": "__REDDIT__/submission_images/b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg", "submission[title]": "from /f/pics", - "submission[forum]": "10009" + "submission[forum]": "10006" }, "response_status": 302 } @@ -18358,12 +17194,11 @@ "intent": "Re-post the image of Thanksgiving turkey from this forum to funny forum using the image URL and title \"from /f/pics\"", "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", "instantiation_dict": {"content": "Thanksgiving turkey", "forum": "funny"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18386,15 +17221,14 @@ 
"task_id": 617, "intent_template_id": 11, "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Firework from this forum to earthporn forum using the image URL and title \"from /f/pics\"", + "intent": "Re-post the image of Firework from this forum to news forum using the image URL and title \"from /f/pics\"", "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Firework", "forum": "earthporn"}, - "format_specification": null, + "instantiation_dict": {"content": "Firework", "forum": "news"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18404,7 +17238,7 @@ "post_data": { "submission[url]": "__REDDIT__/submission_images/92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg", "submission[title]": "from /f/pics", - "submission[forum]": "10009" + "submission[forum]": "10064" }, "response_status": 302 } @@ -18420,12 +17254,11 @@ "intent": "Re-post the image of Wife's costume from this forum to funny forum using the image URL and title \"from /f/pics\"", "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", "instantiation_dict": {"content": "Wife's costume", "forum": "funny"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18451,12 +17284,11 @@ "intent": "Re-post the image of costume contest 
from this forum to funny forum using the image URL and title \"from /f/pics\"", "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", "instantiation_dict": {"content": "costume contest", "forum": "funny"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18482,12 +17314,11 @@ "intent": "Ask for advice in a forum for relations. Title it \"deal with long-distance relationships\" and in post details ask \"Please help\"", "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "deal with long-distance relationships"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18513,12 +17344,11 @@ "intent": "Ask for advice in a forum for relations. Title it \"cheat\" and in post details ask \"Please help\"", "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "cheat"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18544,12 +17374,11 @@ "intent": "Ask for advice in a forum for relations. Title it \"sexual harassment\" and in post details ask \"Please help\"", "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "sexual harassment"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18575,12 +17404,11 @@ "intent": "Ask for advice in a forum for relations. Title it \"gift for birthday\" and in post details ask \"Please help\"", "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "gift for birthday"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18606,12 +17434,11 @@ "intent": "Ask for advice in a forum for relations. Title it \"break-up remedy\" and in post details ask \"Please help\"", "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", "instantiation_dict": {"issue": "break-up remedy"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18637,12 +17464,11 @@ "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "the effectiveness of deep learning"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18668,12 +17494,11 @@ "intent": "Create a discussion post titled \"Iphone 14\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Iphone 14"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18699,12 +17524,11 @@ "intent": "Create 
a discussion post titled \"Harry Potter movie series\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Harry Potter movie series"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18730,12 +17554,11 @@ "intent": "Create a discussion post titled \"long distance relationship\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "long distance relationship"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18761,12 +17584,11 @@ "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", "instantiation_dict": {"topic": "Fun thing to do in Pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": 
{"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18796,12 +17618,11 @@ "price": "$200", "forum": "f/headphones" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18827,12 +17648,11 @@ "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $100, any suggestions?\"", "intent_template": "Create a post in {{forum}}. Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", "instantiation_dict": {"category": "running shoes", "price": "$100", "forum": "f/sports"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18858,12 +17678,11 @@ "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $500, any suggestions?\"", "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", "instantiation_dict": {"category": "running shoes", "price": "$500", "forum": "f/sports"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18889,12 +17708,11 @@ "intent": "Create a post in f/sports. Title it \"running pants recommendations?\" and in post details ask \"Hey, looking for good running pants under $500, any suggestions?\"", "intent_template": "Create a post in {{forum}}. Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", "instantiation_dict": {"category": "running pants", "price": "$500", "forum": "f/sports"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18920,12 +17738,11 @@ "intent": "Create a post in f/iphone. Title it \"used iphone recommendations?\" and in post details ask \"Hey, looking for good used iphone under $1000, any suggestions?\"", "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", "instantiation_dict": {"category": "used iphone", "price": "$1000", "forum": "f/iphone"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18951,12 +17768,11 @@ "intent": "Create a post in the most appropriate forum. Title it \"noise-cancelling headphones recommendations\" and in post details ask \"I need recommendations for noise-cancelling headphones within a budget of $200 please\"", "intent_template": "Create a post in the most appropriate forum. Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "noise-cancelling headphones", "price": "$200"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -18982,12 +17798,11 @@ "intent": "Create a post in the most appropriate forum. Title it \"DIY toolkit recommendations\" and in post details ask \"I need recommendations for DIY toolkit within a budget of $100 please\"", "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "DIY toolkit", "price": "$100"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19013,12 +17828,11 @@ "intent": "Create a post in the most appropriate forum. Title it \"sony headphones recommendations\" and in post details ask \"I need recommendations for sony headphones within a budget of $500 please\"", "intent_template": "Create a post in the most appropriate forum. Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "sony headphones", "price": "$500"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19044,12 +17858,11 @@ "intent": "Create a post in the most appropriate forum. Title it \"Must have product at last for ever recommendations\" and in post details ask \"I need recommendations for Must have product at last for ever within a budget of $30 please\"", "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "Must have product at last for ever", "price": "$30"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19075,12 +17888,11 @@ "intent": "Create a post in the most appropriate forum. Title it \"used iphone recommendations\" and in post details ask \"I need recommendations for used iphone within a budget of $1000 please\"", "intent_template": "Create a post in the most appropriate forum. Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", "instantiation_dict": {"category": "used iphone", "price": "$1000"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19106,12 +17918,11 @@ "intent": "Post a notice in f/books titled \"book reading Meet up!\". Set post details to \"virtual meetup for book reading on March 15th\"", "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", "instantiation_dict": {"interest": "book reading", "date": "March 15th", "forum": "f/books"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19137,12 +17948,11 @@ "intent": "Post a notice in books forum titled \"Harry Potter Meet up!\". Set post details to \"virtual meetup for Harry Potter on July 8th\"", "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". Set post details to \"virtual meetup for {{interest}} on {{date}}\"", "instantiation_dict": {"interest": "Harry Potter", "date": "July 8th", "forum": "books forum"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19168,12 +17978,11 @@ "intent": "Post a notice in books forum titled \"Big little lies Meet up!\". Set post details to \"virtual meetup for Big little lies on Sep 10th\"", "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", "instantiation_dict": {"interest": "Big little lies", "date": "Sep 10th", "forum": "books forum"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19199,12 +18008,11 @@ "intent": "Post a notice in nyc forum titled \"racing cars Meet up!\". Set post details to \"virtual meetup for racing cars on Oct 21st\"", "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". Set post details to \"virtual meetup for {{interest}} on {{date}}\"", "instantiation_dict": {"interest": "racing cars", "date": "Oct 21st", "forum": "nyc forum"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19230,12 +18038,11 @@ "intent": "Post a notice in games forum titled \"Tears of Kingdom Meet up!\". Set post details to \"virtual meetup for Tears of Kingdom on Dec 15th\"", "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", "instantiation_dict": {"interest": "Tears of Kingdom", "date": "Dec 15th", "forum": "games forum"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19265,12 +18072,11 @@ "title": "What could machine learning help the books field?", "forum": "books" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19299,12 +18105,11 @@ "title": "What could midjourney help the DIY field?", "forum": "DIY" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19333,12 +18138,11 @@ "title": "What could open-source LLMs help the technology field?", "forum": "technology" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19367,12 +18171,11 @@ "title": "What could large language models help the dataisbeautiful field?", "forum": "dataisbeautiful" }, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19401,12 +18204,11 @@ "title": "What could diffusion model help the dataisbeautiful field?", "forum": "dataisbeautiful" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19429,22 +18231,27 @@ "intent_template_id": 23, "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], "intent": "Reply to the post on this page with my comment \"I am a big fan of the bookorg\"", - "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", + "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", + "location": "on this page", + "comment_prefix": "my comment ", "content_description": "I am a big fan of the bookorg" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"post_id": "59421", "comment_content": "I am a big fan of the bookorg"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/books/59421/-/comment", 
+ "http_method": "POST", + "post_data": {"reply_to_submission_59421[comment]": "I am a big fan of the bookorg"}, + "response_status": 302 + } } ], "revision": 2 @@ -19457,17 +18264,18 @@ "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign" ], "intent": "Reply to the post on this page with my comment \"Yeah, pittsburgh traffic, you know...\"", - "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", + "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", "instantiation_dict": { "position_description": "the post", + "location": "on this page", + "comment_prefix": "my comment ", "content_description": "Yeah, pittsburgh traffic, you know..." }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19489,19 +18297,27 @@ "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research" ], "intent": "Reply to the post on this page with my comment \"???\"", - "intent_template": "Reply to {{position_description}} on this page with my comment \"{{content_description}}\"", - "instantiation_dict": {"position_description": "the post", "content_description": "???"}, - "format_specification": null, + "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", + "instantiation_dict": { + "position_description": "the post", + "location": "on this page", + "comment_prefix": "my comment ", + "content_description": "???" 
+ }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", - "expected": {"post_id": "134852", "comment_content": "???"} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__REDDIT__/f/technology/134852/-/comment", + "http_method": "POST", + "post_data": {"reply_to_submission_134852[comment]": "???"}, + "response_status": 302 + } } ], "revision": 2 @@ -19514,18 +18330,15 @@ "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. Don\"t submit yet, I will check.", "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", "instantiation_dict": {"product": "phone screen protector", "order_id": "000000180"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -19538,18 +18351,15 @@ "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #161 and the product SKU. Don\"t submit yet, I will check.", "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", "instantiation_dict": {"product": "bluetooth speaker", "order_id": "161"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -19562,18 +18372,15 @@ "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don\"t submit yet, I will check.", "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don\"t submit yet, I will check.", "instantiation_dict": {"product": "iphone case", "order_id": "180"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -19583,24 +18390,21 @@ "task_id": 656, "intent_template_id": 153, "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don\"t submit yet, I will check.", + "intent": "Fill the \"contact us\" form in the site for a refund on the console skin I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don\"t submit yet, I will check.", "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "remote controller", "order_id": "180"}, - "format_specification": null, + "instantiation_dict": {"product": "console skin", "order_id": "180"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], - "revision": 2 + "revision": 3 }, { "sites": ["shopping"], @@ -19610,18 +18414,15 @@ "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don\"t submit yet, I will check.", "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don\"t submit yet, I will check.", "instantiation_dict": {"product": "speaker", "order_id": "148"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", - "last_event_only": true, - "expected": {"url": "__SHOPPING__/contact", "response_status": 200} + "expected": {"url": "__SHOPPING__/contact"} } ], "revision": 2 @@ -19639,12 +18440,11 @@ "account": "Roshanjossey", "due": "the end of 2030" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19675,12 +18475,11 @@ "account": "Roshanjossey", "due": "the beginning of Q2 2033" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19720,12 +18519,11 @@ "account": "Abishek", "due": "July 18 2033" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19764,16 +18562,14 @@ "issue": "discuss the connection refused exception", "repo": "ChatGPT" }, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/175/issues", "http_method": "POST", @@ -19796,16 +18592,14 @@ "issue": "discuss the OS error", "repo": "aem-hacker" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/138/issues", "http_method": "POST", @@ -19825,19 +18619,17 @@ "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { "title": "Llama model support timeline", - "issue": "ask the team's plan on supporting Llama and other llama family models", + "issue": "ask the team\"s plan on supporting Llama and other llama family models", "repo": "metaseq" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/33/issues", "http_method": "POST", @@ -19857,19 +18649,17 @@ "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", "instantiation_dict": { "title": "Question on future usage of Python 3.11", - 
"issue": "ask the team's plans on adding Python 3.11 related resources", + "issue": "ask the team\"s plans on adding Python 3.11 related resources", "repo": "awesome-python" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/84/issues", "http_method": "POST", @@ -19892,16 +18682,14 @@ "issue": "request adding support for MT theme editor", "repo": "a11y-syntax-highlighting" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects/186/issues", "http_method": "POST", @@ -19925,12 +18713,11 @@ "reviewer": "Primer", "source_project": "the current repository" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -19964,12 +18751,11 @@ "reviewer": "primer", "source_project": "the current repository" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": 
null} }, { "evaluator": "NetworkEventEvaluator", @@ -20003,12 +18789,11 @@ "reviewer": "Roshan Jossy", "source_project": "a11yproject.com" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20016,8 +18801,8 @@ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", "http_method": "POST", "post_data": { - "merge_request[target_branch]": "redesign", - "merge_request[source_branch]": "main", + "merge_request[target_branch]": "main", + "merge_request[source_branch]": "redesign", "merge_request[target_project_id]": "174", "merge_request[source_project_id]": "174", "merge_request[assignee_ids][]": "0", @@ -20034,24 +18819,23 @@ "task_id": 669, "intent_template_id": 337, "start_urls": ["__GITLAB__/byteblaze/solarized-prism-theme"], - "intent": "Open a new issue to discuss the implementation of dark mode for the current project", - "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project", + "intent": "Open a new issue to discuss the implementation of dark mode for the current project. Use an appropriate issue title.", + "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project. 
Use an appropriate issue title.", "instantiation_dict": {"feature": "dark mode"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "solarized-prism-theme", - "fields": {"title": "dark mode"}, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues", + "http_method": "POST", + "post_data": {"issue[title]": "^.*dark.*mode.*$"}, + "response_status": 302 + } } ], "revision": 2 @@ -20061,24 +18845,23 @@ "task_id": 670, "intent_template_id": 337, "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Open a new issue to discuss the implementation of default plugins for .zsh for the current project", - "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project", + "intent": "Open a new issue to discuss the implementation of default plugins for .zsh for the current project. Use an appropriate issue title.", + "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project. 
Use an appropriate issue title.", "instantiation_dict": {"feature": "default plugins for .zsh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "fields": {"title": "default plugins for .zsh"}, - "title_field_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/byteblaze/dotfiles/-/issues", + "http_method": "POST", + "post_data": {"issue[title]": "^.*default.*plugins.*zsh.*$"}, + "response_status": 302 + } } ], "revision": 2 @@ -20088,21 +18871,31 @@ "task_id": 671, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Sony Computer Entertainment VR\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", + "intent": "Create a post in the game related discussion forum about Sony Computer Entertainment VR to report customer reviews with 2 stars and less rating from the OneStopShop with the post title \"real user feedback on Sony Computer Entertainment VR\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", + "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", "instantiation_dict": {"product": "Sony Computer Entertainment VR", "rating": "2 stars and less"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[title]": "real user feedback on Sony Computer Entertainment VR", + "submission[body]": "- \"Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.\"\r\n- \"didn't last a year without issues\"\r\n- \"Received used items!!\"\r\n- \"Received used items!!\"", + "submission[forum]": "10048" + }, + "response_status": 302 + } } ], "revision": 2 @@ -20112,24 +18905,34 @@ "task_id": 672, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", + "intent": "Create a post in the game related discussion forum about Nintendo Switch Fortnite Wildcat Console EU to report customer reviews with 3 stars and 
less rating from the OneStopShop with the post title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", + "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", "instantiation_dict": { "product": "Nintendo Switch Fortnite Wildcat Console EU", "rating": "3 stars and less" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[title]": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU", + "submission[body]": "- \"EU charger and wild cat card doesn't even work!\"\r\n- \"REFUND REJECTED\"\r\n- \"Charging port not compatible\"\r\n- \"not compatible in the US\"\r\n- \"Wildcard Bonus Credits Not Redeemable!\"\r\n- \"Code not available!!\"\r\n- \"Code not available!!\"", + "submission[forum]": "10048" + }, + "response_status": 302 + } } ], "revision": 2 @@ -20139,21 +18942,31 @@ "task_id": 673, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star 
rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", + "intent": "Create a post in the game related discussion forum about Racing Wheel Overdrive for Xbox X to report customer reviews with 1 star rating from the OneStopShop with the post title \"real user feedback on Racing Wheel Overdrive for Xbox X\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", + "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", "instantiation_dict": {"product": "Racing Wheel Overdrive for Xbox X", "rating": "1 star"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[title]": "real user feedback on Racing Wheel Overdrive for Xbox X", + "submission[body]": "- \"Unable to set neutral steering\"\r\n- \"Doesn't work with PC.\"\r\n- \"Crazy problems in automatic mode; then pedals 
stopped working\"\r\n- \"Only works with certain games.\"\r\n- \"Only works with certain games.\"", + "submission[forum]": "10048" + }, + "response_status": 302 + } } ], "revision": 2 @@ -20163,24 +18976,34 @@ "task_id": 674, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", + "intent": "Create a post in the game related discussion forum about Doc and Pies Arcade Factory Cocktail Arcade Machine to report customer reviews with 3 stars and less rating from the OneStopShop with the post title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", + "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", "instantiation_dict": { "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", "rating": "3 stars and less" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[title]": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine", + "submission[body]": "- \"Poorly Made Exterior. 
Consider a different Company.\"\r\n- \"piece of junk ,..can't believe I spent money on this !!!!\"\r\n- \"Based arrived broken but game itself works\"\r\n- \"Based arrived broken but game itself works\"", + "submission[forum]": "10048" + }, + "response_status": 302 + } } ], "revision": 2 @@ -20190,21 +19013,31 @@ "task_id": 675, "intent_template_id": 101, "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games forum under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"", - "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games forum under the title \"real user feedback on {{product}}\"", + "intent": "Create a post in the game related discussion forum about HORI 3D Surround Gaming Neckset to report customer reviews with 2 stars and less rating from the OneStopShop with the post title \"real user feedback on HORI 3D Surround Gaming Neckset\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", + "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", "instantiation_dict": {"product": "HORI 3D Surround Gaming Neckset", "rating": "2 stars and less"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "reddit", - "last_event_only": true, - "expected": {"url": "__REDDIT__/f/gaming", "response_status": 200} + "post_data_schema": { + "type": "object", + "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } + }, + "expected": { + "url": "^__REDDIT__/submit.*$", + "http_method": "POST", + "post_data": { + "submission[title]": "real user feedback on HORI 3D Surround Gaming Neckset", + "submission[body]": "- \"Not worth it for PC users\"\r\n- \"I really wanted to like this.\"\r\n- \"I wish this was better...\"\r\n- \"I wish this was better...\"", + "submission[forum]": "10048" + }, + "response_status": 302 + } } ], "revision": 2 @@ -20214,26 +19047,20 @@ "task_id": 676, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the list of orders that are suspected of being fraudulent", - "intent_template": "Show me the list of orders that are {{status}}", + "intent": "Go to the list of orders that are suspected of being fraudulent", + "intent_template": "Go to the list of orders that are {{status}}", "instantiation_dict": {"status": "suspected of being fraudulent"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": 
"NetworkEventEvaluator", - "last_event_only": true, "expected": { "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", - "response_status": 200, - "headers": { - "referer": "__SHOPPING_ADMIN__/sales/order/", - "X-Requested-With": "XMLHttpRequest" - }, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, "query_params": { "namespace" : [ "sales_order_grid" ], "filters[placeholder]": [ "true" ], @@ -20242,7 +19069,6 @@ "keywordUpdated" : [ "false" ] } }, - "site": "shopping_admin", "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } ], @@ -20253,26 +19079,20 @@ "task_id": 677, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the list of orders that are processing", - "intent_template": "Show me the list of orders that are {{status}}", + "intent": "Go to the list of orders that are processing", + "intent_template": "Go to the list of orders that are {{status}}", "instantiation_dict": {"status": "processing"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/mui/index/render/", - "response_status": 200, - "headers": { - "referer": "__SHOPPING_ADMIN__/sales/order/", - "X-Requested-With": "XMLHttpRequest" - }, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, "query_params": { "namespace" : [ "sales_order_grid" ], "filters[placeholder]": [ "true" ], @@ -20281,7 +19101,6 @@ "keywordUpdated" : [ "false" ] } }, - "site": "shopping_admin", "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } ], @@ -20292,26 +19111,20 @@ "task_id": 678, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the list of 
orders that are canceled", - "intent_template": "Show me the list of orders that are {{status}}", + "intent": "Go to the list of orders that are canceled", + "intent_template": "Go to the list of orders that are {{status}}", "instantiation_dict": {"status": "canceled"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/mui/index/render/", - "response_status": 200, - "headers": { - "referer": "__SHOPPING_ADMIN__/sales/order/", - "X-Requested-With": "XMLHttpRequest" - }, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, "query_params": { "namespace" : [ "sales_order_grid" ], "filters[placeholder]": [ "true" ], @@ -20320,7 +19133,6 @@ "keywordUpdated" : [ "false" ] } }, - "site": "shopping_admin", "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } ], @@ -20331,32 +19143,30 @@ "task_id": 679, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the list of orders that are completed", - "intent_template": "Show me the list of orders that are {{status}}", + "intent": "Go to the list of orders that are completed", + "intent_template": "Go to the list of orders that are {{status}}", "instantiation_dict": {"status": "completed"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", - "decode_base64_query": true, 
- "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, "query_params": { "namespace" : [ "sales_order_grid" ], "filters[placeholder]": [ "true" ], "filters[status]" : [ "complete" ], "search" : [ "" ], "keywordUpdated" : [ "false" ] - } + }, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"} }, - "site": "shopping_admin", - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] + "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"], + "decode_base64_query": true } ], "revision": 2 @@ -20366,26 +19176,20 @@ "task_id": 680, "intent_template_id": 253, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show me the list of orders that are on hold", - "intent_template": "Show me the list of orders that are {{status}}", + "intent": "Go to the list of orders that are on hold", + "intent_template": "Go to the list of orders that are {{status}}", "instantiation_dict": {"status": "on hold"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "last_event_only": true, "expected": { "url": "__SHOPPING_ADMIN__/mui/index/render/", - "response_status": 200, - "headers": { - "referer": "__SHOPPING_ADMIN__/sales/order/", - "X-Requested-With": "XMLHttpRequest" - }, + "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, "query_params": { "namespace" : [ "sales_order_grid" ], "filters[placeholder]": [ "true" ], @@ -20394,7 +19198,6 @@ "keywordUpdated" : [ "false" ] } }, - "site": "shopping_admin", "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] } ], @@ -20408,12 +19211,11 @@ "intent": "Find a GitLab repository related to gan implementation and create a new post in the discussion forum with the URL of the repository to the most 
relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "gan implementation"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20441,12 +19243,11 @@ "intent": "Find a GitLab repository related to chatGPT and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "chatGPT"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20474,12 +19275,11 @@ "intent": "Find a GitLab repository related to metaseq and create a new post in the discussion forum with the URL of the repository to the most relevant forum. 
Use the repository path as the post title and \" commit(s) already!\" as the post body.", "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", "instantiation_dict": {"topic": "metaseq"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20507,12 +19307,11 @@ "intent": "Promote byteblaze/cloud-to-butt in the discussion forum LifeProTips by creating a post with the project description as the title.", "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "forum": "LifeProTips"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20539,12 +19338,11 @@ "intent": "Promote byteblaze/dotfiles in the discussion forum aww by creating a post with the project description as the title.", "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", "instantiation_dict": {"repo": "byteblaze/dotfiles", "forum": "aww"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": 
"SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20571,12 +19369,11 @@ "intent": "Promote auth0/angular-storage in the discussion forum technology by creating a post with the project description as the title.", "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", "instantiation_dict": {"repo": "auth0/angular-storage", "forum": "technology"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20603,12 +19400,11 @@ "intent": "Promote koush/AndroidAsync in the discussion forum funny by creating a post with the project description as the title.", "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", "instantiation_dict": {"repo": "koush/AndroidAsync", "forum": "funny"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20635,12 +19431,11 @@ "intent": "Promote lahwaacz/arch-wiki-docs in the discussion forum science by creating a post with the project description as the title.", "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", "instantiation_dict": {"repo": "lahwaacz/arch-wiki-docs", "forum": "science"}, - "format_specification": null, 
"eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20664,15 +19459,14 @@ "task_id": 689, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I am a loyal customer, give me a coupon\"", - "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I am a loyal customer, give me a coupon\". Keep it ready for review.", + "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". Keep it ready for review.", "instantiation_dict": {"reason": "I am a loyal customer, give me a coupon"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20701,15 +19495,14 @@ "task_id": 690, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"they promised me a coupon last time\"", - "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Prepare a coupon request in the shop's contact form with the reason: \"they promised me a coupon last time\". 
Keep it ready for review.", + "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". Keep it ready for review.", "instantiation_dict": {"reason": "they promised me a coupon last time"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20738,15 +19531,14 @@ "task_id": 691, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I plan to make a bulk purchase\"", - "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I plan to make a bulk purchase\". Keep it ready for review.", + "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", "instantiation_dict": {"reason": "I plan to make a bulk purchase"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20775,15 +19567,14 @@ "task_id": 692, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"I am a student\"", - "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I am a student\". Keep it ready for review.", + "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", "instantiation_dict": {"reason": "I am a student"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20812,15 +19603,14 @@ "task_id": 693, "intent_template_id": 163, "start_urls": ["__SHOPPING__"], - "intent": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"my refund is supposed to be replaced by a coupon\"", - "intent_template": "Draft a message using the shop contact form to the shop owner for a coupon, enter in the exact reason of \"{{reason}}\"", + "intent": "Prepare a coupon request in the shop's contact form with the reason: \"my refund is supposed to be replaced by a coupon\". Keep it ready for review.", + "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", "instantiation_dict": {"reason": "my refund is supposed to be replaced by a coupon"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -20858,16 +19648,14 @@ "color": "blue", "price": "60" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "ignored_query_params_patterns": ["isAjax"], "expected": { "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", @@ -20879,7 +19667,6 @@ "product[status]": "1", "product[quantity_and_stock_status][qty]": "50", "product[quantity_and_stock_status][is_in_stock]": "1", - "product[new]": "1", "product[size]": "167", "product[color]": "50" } @@ -20902,16 +19689,14 @@ "color": "yellow", "price": "69.99" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "ignored_query_params_patterns": ["isAjax"], "expected": { "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", @@ -20923,7 +19708,6 @@ "product[status]": "1", "product[quantity_and_stock_status][qty]": "50", "product[quantity_and_stock_status][is_in_stock]": "1", - 
"product[new]": "1", "product[size]": "179", "product[color]": "60" } @@ -20946,16 +19730,14 @@ "color": "Blue", "price": "169.99" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "ignored_query_params_patterns": ["isAjax"], "expected": { "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", @@ -20967,7 +19749,6 @@ "product[status]": "1", "product[quantity_and_stock_status][qty]": "42", "product[quantity_and_stock_status][is_in_stock]": "1", - "product[new]": "1", "product[size]": "177", "product[color]": "50" } @@ -20990,16 +19771,14 @@ "color": "Blue", "price": "769.99" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "ignored_query_params_patterns": ["isAjax"], "expected": { "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", @@ -21011,7 +19790,6 @@ "product[status]": "1", "product[quantity_and_stock_status][qty]": "42", "product[quantity_and_stock_status][is_in_stock]": "1", - "product[new]": "1", "product[color]": "50" } } @@ -21033,23 +19811,27 @@ "color": "black", "price": "769.99" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "name": "Lelelumon Yoga Mat", - "site": "shopping_admin", + "evaluator": "NetworkEventEvaluator", + "ignored_query_params_patterns": ["isAjax"], "expected": { - "name": "Lelelumon Yoga Mat", - "price": 769.99, - "stock_qty": 42, - "color": "49-black", - "size": "uni-size" + "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "response_status": 302, + "post_data": { + "product[name]": "Lelelumon Yoga Mat", + "product[price]": "769.99", + "product[status]": "1", + "product[quantity_and_stock_status][qty]": "42", + "product[quantity_and_stock_status][is_in_stock]": "1", + "product[color]": "49" + } } } ], @@ -21060,19 +19842,17 @@ "task_id": 699, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"spring sale\" that offers a 20 percent discount site-wide for all customers", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"spring sale\" for all registered customers that offers a 20 percent discount site-wide", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", "instantiation_dict": {"topic": "spring sale", "rule": "a 20 percent discount site-wide"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", "http_method": "POST", @@ -21094,19 +19874,17 @@ "task_id": 700, "intent_template_id": 258, 
"start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"fall discount\" that offers $10 discount on checkout for all customers", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"fall discount\" for all registered customers that offers $10 discount on checkout", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", "instantiation_dict": {"topic": "fall discount", "rule": "$10 discount on checkout"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", "http_method": "POST", @@ -21128,22 +19906,20 @@ "task_id": 701, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"Mother's day sale\" that offers 15% discount on checkout on all their cart for all customers", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"Mother's day sale\" for all registered customers that offers 15% discount on checkout on all their cart", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", "instantiation_dict": { "topic": "Mother's day sale", "rule": "15% discount on checkout on all their cart" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - 
"expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", "http_method": "POST", @@ -21167,19 +19943,17 @@ "task_id": 702, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"Pride Month\" that offers 45% off on all products for all customers", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"Pride Month\" for all registered customers that offers 45% off on all products", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", "instantiation_dict": {"topic": "Pride Month", "rule": "45% off on all products"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { "url": "__SHOPPING_ADMIN__/catalog_rule/promo_catalog/save/", "http_method": "POST", @@ -21202,19 +19976,17 @@ "task_id": 703, "intent_template_id": 258, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"Thanks giving sale\" that offers $40 discount on all their purchase for all customers", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" that offers {{rule}} for all customers", + "intent": "Create a new marketing price rule called \"Thanks giving sale\" for all registered customers that offers $40 
discount on all their purchase", + "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", "instantiation_dict": {"topic": "Thanks giving sale", "rule": "$40 discount on all their purchase"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", "expected": { "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", "http_method": "POST", @@ -21240,30 +20012,19 @@ "task_id": 704, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is March 15, 2023, generate a new sales order report for last months", - "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + "intent": "Show the sales order report for for last months (today is March 15, 2023).", + "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", "instantiation_dict": {"report": "sales order report", "time_span": "for last months"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { 
"type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", "query_params": { @@ -21271,6 +20032,14 @@ "from" : [ "02/1/2023" ], "to" : [ "02/28/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21281,30 +20050,19 @@ "task_id": 705, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is March 15, 2023, generate a new sales order report over the last 45 days", - "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + "intent": "Show the sales order report for over the last 45 days (today is March 15, 2023).", + "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", "instantiation_dict": {"report": "sales order report", "time_span": "over the last 45 days"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", "query_params": { @@ -21312,6 +20070,14 @@ "from" : [ 
"01/29/2023" ], "to" : [ "03/15/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21322,30 +20088,19 @@ "task_id": 706, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is March 15, 2023, generate a new refund report for Q1", - "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + "intent": "Show the refund report for for Q1 (today is March 15, 2023).", + "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", "instantiation_dict": {"report": "refund report", "time_span": "for Q1"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/refunded/filter", "query_params": { @@ -21353,6 +20108,14 @@ "from" : [ "01/1/2023" ], "to" : [ "03/31/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", 
"format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21363,38 +20126,33 @@ "task_id": 707, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is March 15, 2023, generate a new sales order report for last year", - "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + "intent": "Show the sales order report for for last year (today is March 15, 2023).", + "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", "instantiation_dict": {"report": "sales order report", "time_span": "for last year"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type" : { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" }, - "order_statuses[]": { "type": "string" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", "query_params": { - "report_type" : [ "created_at_order" ], - "from" : [ "01/1/2022" ], - "to" : [ "12/31/2022" ], - "order_statuses[]": [ "complete" ] + "report_type": [ "created_at_order" ], + "from" : [ "1/1/2022" ], + "to" : [ "12/31/2022" ] + } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": 
"date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } } } } @@ -21406,30 +20164,19 @@ "task_id": 708, "intent_template_id": 268, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Today is March 15, 2023, generate a new tax report for this year", - "intent_template": "Today is March 15, 2023, generate a new {{report}} {{time_span}}", + "intent": "Show the tax report for for this year (today is March 15, 2023).", + "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", "instantiation_dict": {"report": "tax report", "time_span": "for this year"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/tax/filter", "query_params": { @@ -21437,6 +20184,14 @@ "from" : [ "01/1/2023" ], "to" : [ "03/15/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21447,30 +20202,24 @@ "task_id": 709, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create an orders 
report from May 1, 2021 to March 31, 2022", - "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": {"type": "orders", "start_date": "May 1, 2021", "end_date": "March 31, 2022"}, - "format_specification": null, + "intent": "Show the orders report from May 1, 2021 to March 31, 2022.", + "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", + "instantiation_dict": { + "type": "orders", + "start_date": "May 1, 2021", + "end_date": "March 31, 2022", + "article": "an" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", "query_params": { @@ -21478,6 +20227,14 @@ "from" : [ "05/1/2021" ], "to" : [ "03/31/2022" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21488,30 +20245,24 @@ "task_id": 710, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a shipping report from August 5, 2022 to March 1, 2023", - "intent_template": "Create a {{type}} report 
from {{start_date}} to {{end_date}}", - "instantiation_dict": {"type": "shipping", "start_date": "08/05/2022", "end_date": "03/01/2023"}, - "format_specification": null, + "intent": "Show the shipping report from August 5, 2022 to March 1, 2023.", + "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", + "instantiation_dict": { + "type": "shipping", + "start_date": "August 5, 2022", + "end_date": "March 1, 2023", + "article": "a" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/shipping/filter", "query_params": { @@ -21519,6 +20270,14 @@ "from" : [ "08/5/2022" ], "to" : [ "03/1/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21529,30 +20288,24 @@ "task_id": 711, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a product view report from July 5, 2021 to May 31, 2023", - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": {"type": "product 
view", "start_date": "07/05/2021", "end_date": "05/31/2023"}, - "format_specification": null, + "intent": "Show the product view report from July 5, 2021 to May 31, 2023.", + "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", + "instantiation_dict": { + "type": "product view", + "start_date": "July 5, 2021", + "end_date": "May 31, 2023", + "article": "a" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_product/viewed/filter", "query_params": { @@ -21560,6 +20313,14 @@ "from" : [ "07/5/2021" ], "to" : [ "05/31/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21570,30 +20331,24 @@ "task_id": 712, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a coupons report from May 1, 2021 to May 15, 2023", - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": {"type": "coupons", "start_date": "05/01/2021", "end_date": "05/15/2023"}, - "format_specification": 
null, + "intent": "Show the coupons report from May 1, 2021 to May 15, 2023.", + "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", + "instantiation_dict": { + "type": "coupons", + "start_date": "May 1, 2021", + "end_date": "May 15, 2023", + "article": "a" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/filter", "query_params": { @@ -21601,6 +20356,14 @@ "from" : [ "05/1/2021" ], "to" : [ "05/15/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21611,30 +20374,24 @@ "task_id": 713, "intent_template_id": 271, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a best sellers report from May 1, 2022 to May 31, 2023", - "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}", - "instantiation_dict": {"type": "best sellers", "start_date": "05/01/2022", "end_date": "05/31/2023"}, - "format_specification": null, + "intent": "Show the best sellers report from May 1, 2022 to May 31, 2023.", + 
"intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", + "instantiation_dict": { + "type": "best sellers", + "start_date": "May 1, 2022", + "end_date": "May 31, 2023", + "article": "a" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping_admin", - "last_event_only": true, - "ignored_query_params_patterns": ["period_type", "^show"], + "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], "decode_base64_query": true, - "query_string_schema": { - "type": "object", - "properties": { - "report_type": { "type": "string" }, - "from" : { "type": "string", "format": "date" }, - "to" : { "type": "string", "format": "date" } - } - }, "expected": { "url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/filter", "query_params": { @@ -21642,6 +20399,14 @@ "from" : [ "05/1/2022" ], "to" : [ "05/31/2023" ] } + }, + "query_params_schema": { + "type": "object", + "properties": { + "report_type": { "type": "array", "items": {"type": "string"} }, + "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, + "to" : { "type": "array", "items": {"type": "string", "format": "date"} } + } } } ], @@ -21655,12 +20420,11 @@ "intent": "Thumbs down the top 1 post ever in gadgets.", "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", "instantiation_dict": {"k": "1", "forum": "gadgets"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21681,12 
+20445,11 @@ "intent": "Thumbs down the top 2 post ever in history.", "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", "instantiation_dict": {"k": "2", "forum": "history"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21715,12 +20478,11 @@ "intent": "Thumbs down the top 3 post ever in books.", "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", "instantiation_dict": {"k": "3", "forum": "books"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21757,12 +20519,11 @@ "intent": "Thumbs down the top 4 post ever in movies.", "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", "instantiation_dict": {"k": "4", "forum": "movies"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21807,12 +20568,11 @@ "intent": "Thumbs down the top 5 post ever in technology.", "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", "instantiation_dict": {"k": "5", "forum": "technology"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": 
"mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21865,12 +20625,11 @@ "intent": "Like all submissions created by ThetaGang_wsb in forum wallstreetbets", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "ThetaGang_wsb", "forum": "wallstreetbets"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21899,12 +20658,11 @@ "intent": "Like all submissions created by CameronKelsey in forum earthporn", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "CameronKelsey", "forum": "earthporn"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -21941,12 +20699,11 @@ "intent": "Like all submissions created by UniversityofBath in forum IAmA", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "UniversityofBath", "forum": "IAmA"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22023,12 +20780,11 @@ 
"intent": "Like all submissions created by Don_Gato1 in forum nyc", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "Don_Gato1", "forum": "nyc"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22049,16 +20805,11 @@ "intent": "Like all submissions created by FTorrez81 in forum iphone13", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "FTorrez81", "forum": "iphone13"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -22071,12 +20822,11 @@ "intent": "Like all submissions created by Hrekires in forum news", "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "Hrekires", "forum": "news"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22169,12 +20919,11 @@ "intent": "DisLike all submissions created by RickyDontLoseThat in forum massachusetts", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "RickyDontLoseThat", "forum": "massachusetts"}, - 
"format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22195,16 +20944,11 @@ "intent": "DisLike all submissions created by jacyanthis in forum earthporn", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "jacyanthis", "forum": "earthporn"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -22217,12 +20961,11 @@ "intent": "DisLike all submissions created by PatientBuilder499 in forum videos", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "PatientBuilder499", "forum": "videos"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22243,12 +20986,11 @@ "intent": "DisLike all submissions created by sirbarani in forum sports", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "sirbarani", "forum": "sports"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": 
null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22269,12 +21011,11 @@ "intent": "DisLike all submissions created by AdamCannon in forum UpliftingNews", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "AdamCannon", "forum": "UpliftingNews"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22295,12 +21036,11 @@ "intent": "DisLike all submissions created by Hrekires in forum news", "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", "instantiation_dict": {"user": "Hrekires", "forum": "news"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22393,20 +21133,22 @@ "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": {"post": "Nvidia RTX 4090", "content": "EDIT: This news aged well"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": 
"reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "MachineLearning", - "post_id": 1, - "title": "EDIT: This news aged well" + "url": "__REDDIT__/f/MachineLearning/1/-/edit", + "http_method": "POST", + "post_data": { + "submission[title]": "Nvidia RTX 4090", + "submission[body]": "^\\s*Crazy device for ML!\\s*EDIT: This news aged well\\s*$" + }, + "response_status": 302 } } ], @@ -22420,20 +21162,22 @@ "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", "instantiation_dict": {"post": "The Night Agent", "content": "Done watching, pretty cool!"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "site": "reddit", + "evaluator": "NetworkEventEvaluator", "expected": { - "subreddit": "television", - "post_id": 134868, - "title": "Done watching, pretty cool!" + "url": "__REDDIT__/f/television/134868/-/edit", + "http_method": "POST", + "post_data": { + "submission[title]": "'The Night Agent' Renewed for Season 2 at Netflix", + "submission[body]": "Done watching, pretty cool!" 
+ }, + "response_status": 302 } } ], @@ -22450,19 +21194,17 @@ "post": "Star Trek Starfleet Academy series", "content": "Every watch makes me feel like a kid again" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__REDDIT__/f/television/135201/-/edit", "http_method": "POST", - "headers": {"referer": "__REDDIT__/f/television/135201/-/edit"}, "post_data": { "submission[title]": "Star Trek: Starfleet Academy Series, From Alex Kurtzman and Nancy Drew Creator, Ordered at Paramount+", "submission[body]": "Every watch makes me feel like a kid again" @@ -22484,19 +21226,17 @@ "post": "Ted Lasso season 3 premiere", "content": "Done watching. I love the renew!" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__REDDIT__/f/television/135156/-/edit", "http_method": "POST", - "headers": {"referer": "__REDDIT__/f/television/135156/-/edit"}, "post_data": { "submission[title]": "\u2018Ted Lasso\u2019 Season 3 Premiere Scores 870K U.S. Households, Up 59% From Season 2 Premiere", "submission[body]": "Done watching. I love the renew!" @@ -22518,19 +21258,17 @@ "post": "Lord of the Rings season 2 cast announcement with the lowest vote count", "content": "The cast is amazing!" 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "url": "__REDDIT__/f/television/70886/-/edit", "http_method": "POST", - "headers": {"referer": "__REDDIT__/f/television/70886/-/edit"}, "post_data": { "submission[title]": "\u2018The Lord Of The Rings: The Rings Of Power\u2019 Adds 8 To Season 2 Cast", "submission[body]": "The cast is amazing!" @@ -22552,32 +21290,65 @@ "repo": "byteblaze/gimmiethat.space and byteblaze/dotfiles", "license": "MIT licenses" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "gimmiethat.space", - "file_path": "LICENSE", - "substring": "MIT License", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, + "expected": { + "url": [ + "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/LICENSE", + "__GITLAB__/byteblaze/gimmiethat.space/-/create/main", + "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE" + ], + "http_method": "POST", + "post_data": { + "$.^file_path|file_name$": "LICENSE", + "branch_name": "main", + "content": "^.*MIT.*License.*$", + "_method": "^.*(?!delete$).*$" + }, + "response_status": 302 + } }, { - "evaluator": 
"BackendStateEvaluator", - "group": "byteblaze", - "project": "dotfiles", - "file_path": "LICENSE", - "substring": "MIT License", - "branch": "main", - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { + "_method" : { "type": "string" }, + "file_path" : { "type": "string" }, + "branch_name": { "type": "string" }, + "content" : { "type": "string" } + } + }, + "expected": { + "url": [ + "__GITLAB__/byteblaze/dotfiles/-/update/main/LICENSE", + "__GITLAB__/byteblaze/dotfiles/-/create/main", + "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE" + ], + "http_method": "POST", + "post_data": { + "$.^file_path|file_name$": "LICENSE", + "branch_name": "main", + "content": "^.*MIT.*License.*$", + "_method": "^.*(?!delete$).*$" + }, + "response_status": 302 + } } ], "revision": 2 @@ -22587,25 +21358,23 @@ "task_id": 737, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers (use the provided wiki site to look up any needed information)", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", + "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Philadelphia 76ers. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", - "time": "" + "time_phrase": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-75.1718916,39.9011873;-79.9427192,40.4441897.*$"} } ], "revision": 2 @@ -22615,25 +21384,23 @@ "task_id": 738, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th (use the provided wiki site to look up any needed information)", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", + "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70s. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Philadelphia 76ers", - "time": "in the 70th" + "time_phrase": " in the 70s" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-75.1712951,39.9042046;-79.9427192,40.4441897.*$"} } ], "revision": 2 @@ -22643,25 +21410,23 @@ "task_id": 739, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th (use the provided wiki site to look up any needed information)", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", + "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Yankees in the 80s. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Yankees", - "time": "in the 80th" + "time_phrase": " in the 80s" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-73.9265212,40.8295828;-79.9427192,40.4441897.*$"} } ], "revision": 2 @@ -22671,25 +21436,23 @@ "task_id": 740, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team (use the provided wiki site to look up any needed information)", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", + "intent": "View the route on the map from Carnegie Mellon University to the home stadium of NYC NBA team. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "NYC NBA team", - "time": "" + "time_phrase": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-73.9935443,40.7505085;-79.9427192,40.4441897.*$"} } ], "revision": 2 @@ -22699,25 +21462,23 @@ "task_id": 741, "intent_template_id": 94, "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team (use the provided wiki site to look up any needed information)", - "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}} (use the provided wiki site to look up any needed information)", + "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Boston home NBA team. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", + "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", "instantiation_dict": { "location": "Carnegie Mellon University", "sport_team": "Boston home NBA team", - "time": "" + "time_phrase": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": {"url": "^.*/route/v1/.*/-71.0621475,42.3662922;-79.9427192,40.4441897.*$"} } ], "revision": 2 @@ -22728,14 +21489,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", - "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members", - "instantiation_dict": {"scope": "private", "account_list": "Abishek, Vinta"}, - "format_specification": null, + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", + "instantiation_dict": { + "create_spec": "new private project", + "project_name": "planner", + "suffix": " and add Abishek, Vinta as members" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22773,14 +21537,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a new public project \"web_arena\" and add Abishek, 
Vinta as members", - "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members", - "instantiation_dict": {"scope": "public", "account_list": "Abishek, Vinta"}, - "format_specification": null, + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", + "instantiation_dict": { + "create_spec": "new public project", + "project_name": "web_arena", + "suffix": " and add Abishek, Vinta as members" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22818,14 +21585,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a new public project \"AutoAGI\" and add primer as members", - "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members", - "instantiation_dict": {"scope": "public", "account_list": "primer"}, - "format_specification": null, + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", + "instantiation_dict": { + "create_spec": "new public project", + "project_name": "AutoAGI", + "suffix": " and add primer as members" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22854,14 +21624,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", - "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members", - "instantiation_dict": {"scope": 
"public", "account_list": "primer, convexegg, abishek"}, - "format_specification": null, + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", + "instantiation_dict": { + "create_spec": "new public project", + "project_name": "awesome-llms", + "suffix": " and add primer, convexegg, abishek as members" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22908,14 +21681,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", - "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members", - "instantiation_dict": {"scope": "private", "account_list": "primer, convexegg, abishek"}, - "format_specification": null, + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", + "instantiation_dict": { + "create_spec": "new private project", + "project_name": "llm_bulk_inference", + "suffix": " and add primer, convexegg, abishek as members" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22968,12 +21744,11 @@ "template": "blank", "account_list": "Abishek, Vinta" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -22993,14 +21768,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members"}, "http_method": "POST", "post_data": {"user_id": "5,278"}, "response_status": 201 @@ -23021,16 +21794,14 @@ "template": "Android", "account_list": "primer, convexegg, abishek" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects", "http_method": "POST", @@ -23045,7 +21816,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } @@ -23072,16 +21842,14 @@ "template": "NodeJS", "account_list": "primer, convexegg, vinta" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "expected": { "url": "__GITLAB__/api/v4/projects", "http_method": "POST", @@ -23096,7 +21864,6 @@ }, { "evaluator": "NetworkEventEvaluator", - "site": "gitlab", "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": 
"string_list"} } @@ -23123,12 +21890,11 @@ "template": "HTML Gitlab pages", "account_list": "Vinta Chen" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -23148,10 +21914,8 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/byteblaze/agi_index/-/project_members"}, "http_method": "POST", "post_data": {"user_id": "278"}, "response_status": 201 @@ -23172,12 +21936,11 @@ "template": "JEKYLL", "account_list": "Rohan and Vinta" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -23197,14 +21960,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/byteblaze/AGISite/-/project_members"}, "http_method": "POST", "post_data": {"user_id": "2366,278"}, "response_status": 201 @@ -23219,18 +21980,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private blank repository called \"web_agent\"", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "intent_template": "Create a 
{{create_spec}} \"{{project_name}}\"{{suffix}}", "instantiation_dict": { + "create_spec": "private blank repository called", "project_name": "web_agent", - "template": "blank with readme", - "description": "" + "suffix": "" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -23241,8 +22001,7 @@ "project[name]": "web_agent", "project[path]": "web_agent", "project[namespace_id]": "2505", - "project[visibility_level]": "0", - "project[initialize_with_readme]": "1" + "project[visibility_level]": "0" }, "response_status": 302 } @@ -23256,18 +22015,17 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", "instantiation_dict": { + "create_spec": "private Android repository called", "project_name": "web_agent_android_xs", - "template": "Android", - "description": " using the right template to speed up development." + "suffix": " using the right template to speed up development." 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -23293,28 +22051,32 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", "instantiation_dict": { + "create_spec": "private NodeJS repository called", "project_name": "web_agent_nodejs", - "template": "NodeJS", - "description": " using the right template to speed up development." + "suffix": " using the right template to speed up development." 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_agent_nodejs", - "visibility": "private", - "initial_commit_message_substrings": ["Initialized from 'NodeJS Express' project template"], - "require_all_substrings": false, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "express", + "project[name]": "web_agent_nodejs", + "project[namespace_id]": "2505", + "project[path]": "web_agent_nodejs", + "project[visibility_level]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -23325,31 +22087,32 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", "instantiation_dict": { + "create_spec": "private HTML repository called", "project_name": "web_agent_index", - "template": "HTML", - "description": " using the right template to speed up development." + "suffix": " using the right template to speed up development." 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "web_agent_index", - "fields": { - "name": "web_agent_index", - "description": "plain HTML", - "visibility": "private" - }, - "description_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "plainhtml", + "project[name]": "web_agent_index", + "project[namespace_id]": "2505", + "project[path]": "web_agent_index", + "project[visibility_level]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -23360,27 +22123,32 @@ "intent_template_id": 332, "start_urls": ["__GITLAB__"], "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", - "intent_template": "Create a private {{template}} repository called \"{{project_name}}\"{{description}}", + "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", "instantiation_dict": { + "create_spec": "private JEKYLL repository called", "project_name": "11711_gitlab", - "template": "JEKYLL", - "description": " using the right template to speed up development." + "suffix": " using the right template to speed up development." 
}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "group": "byteblaze", - "project": "11711_gitlab", - "fields": {"name": "11711_gitlab", "description": "Jekyll site", "visibility": "private"}, - "description_is_substring": true, - "site": "gitlab", - "expected": true + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__GITLAB__/projects", + "http_method": "POST", + "post_data": { + "project[template_name]": "nfjekyll", + "project[name]": "11711_gitlab", + "project[namespace_id]": "2505", + "project[path]": "11711_gitlab", + "project[visibility_level]": "0" + }, + "response_status": 302 + } } ], "revision": 2 @@ -23390,25 +22158,20 @@ "task_id": 757, "intent_template_id": 42, "start_urls": ["__MAP__"], - "intent": "Show me the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions.", - "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", + "intent": "Show on the map the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions. (Use the OSRM direction service.)", + "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "city1": "home of the 1980 Super Bowl champions", - "city2": "home of the 1991 Super Bowl champions" + "city2": "home of the 1991 Super Bowl champions", + "view_type": "path and travel time" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23417,22 +22180,20 @@ "task_id": 758, "intent_template_id": 42, "start_urls": ["__MAP__"], - "intent": "Show me the path and travel time from the big apple to biggest city in Maine.", - "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.", - "instantiation_dict": {"city1": "the big apple", "city2": "biggest city in Maine"}, - "format_specification": null, + "intent": "Show on the map the path and travel time from the big apple to biggest city in Maine. (Use the OSRM direction service.)", + "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", + "instantiation_dict": { + "city1": "the big apple", + "city2": "biggest city in Maine", + "view_type": "path and travel time" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23441,24 +22202,26 @@ "task_id": 759, "intent_template_id": 42, "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], - "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City", - "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "intent": "Show on the map the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City. (Use the OSRM direction service.)", + "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "city1": "the city where my E-commerce customer Sophia Young lives", - "city2": "New York City" + "city2": "New York City", + "view_type": "route and driving time" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": { + "url": "^.*/route/v1/.*/-71.060511,42.3554334;-1.4869496,55.0252998.*$", + "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} + } } ], "revision": 2 @@ -23468,24 +22231,26 @@ "task_id": 760, "intent_template_id": 42, "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], - "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives", - "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}", + "intent": "Show on the map the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives. (Use the OSRM direction service.)", + "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", "instantiation_dict": { "city1": "Allentown, PA", - "city2": "the city where my E-commerce customer Amanda Kim lives" + "city2": "the city where my E-commerce customer Amanda Kim lives", + "view_type": "route and driving time" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} + "ignored_query_params_patterns": [".*"], + "expected": { + "url": "^.*/route/v1/.*/-75.4716115,40.6022552;-74.4041622,40.0757384.*$", + "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} + } } ], "revision": 2 @@ -23495,26 +22260,20 @@ "task_id": 761, "intent_template_id": 54, "start_urls": ["__MAP__"], - "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk options.", - "intent_template": "Get directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options.", + "intent": "Show on the map directions from Carnegie Science Museum to Hunt library CMU using walk options. (Use the OSRM direction service.)", + "intent_template": "Show on the map directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options. 
(Use the OSRM direction service.)", "instantiation_dict": { "location_address_1": "Carnegie Science Museum", "location_address_2": "Hunt library CMU", "transportation": "walk" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23523,26 +22282,20 @@ "task_id": 762, "intent_template_id": 54, "start_urls": ["__MAP__"], - "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.", - "intent_template": "Get directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options.", + "intent": "Show on the map directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options. (Use the OSRM direction service.)", + "intent_template": "Show on the map directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options. 
(Use the OSRM direction service.)", "instantiation_dict": { "location_address_1": "Carnegie Music Hall in NYC", "location_address_2": "Carnegie Mellon University", "transportation": "driving" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23551,22 +22304,16 @@ "task_id": 763, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest Trader Joe\"s from 401 Shady Ave, Pittsburgh.", - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "intent": "Show on the map the walking route to the closest Trader Joe's from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", + "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", "instantiation_dict": {"store": "Trader Joe's", "location": "401 Shady Ave, Pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23575,22 +22322,16 @@ "task_id": 764, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest Target from 401 Shady Ave, Pittsburgh.", - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "intent": "Show on the map the walking route to the closest Target from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", + "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", "instantiation_dict": {"store": "Target", "location": "401 Shady Ave, Pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23599,22 +22340,16 @@ "task_id": 765, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest Japanese food market from 401 Shady Ave, Pittsburgh.", - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "intent": "Show on the map the walking route to the closest Japanese food market from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", + "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", "instantiation_dict": {"store": "Japanese food market", "location": "401 Shady Ave, Pittsburgh"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23623,22 +22358,16 @@ "task_id": 766, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest grocessory owned by Amazon from 401 Shady Ave, Pittsburgh.", - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", - "instantiation_dict": {"store": "grocessory owned by Amazon", "location": "401 Shady Ave, Pittsburgh"}, - "format_specification": null, + "intent": "Show on the map the walking route to the closest grocery owned by Amazon from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", + "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", + "instantiation_dict": {"store": "grocery owned by Amazon", "location": "401 Shady Ave, Pittsburgh"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23647,25 +22376,19 @@ "task_id": 767, "intent_template_id": 75, "start_urls": ["__MAP__"], - "intent": "Find the walkway to the closest chain grocessory owned by a local business from 401 Shady Ave, Pittsburgh.", - "intent_template": "Find the walkway to the closest {{store}} from {{location}}.", + "intent": "Show on the map the walking route to the closest chain grocery owned by a local business from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", + "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", "instantiation_dict": { - "store": "chain grocessory owned by a local business", + "store": "chain grocery owned by a local business", "location": "401 Shady Ave, Pittsburgh" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "navigate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} }, - { - "evaluator": "NetworkEventEvaluator", - "site": "map", - "last_event_only": true, - "expected": {"url": "__MAP__", "response_status": 200} - } + { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } ], "revision": 2 }, @@ -23675,20 +22398,25 @@ "intent_template_id": 241, "start_urls": ["__SHOPPING_ADMIN__"], "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", - "intent_template": "{{quantity}} {{product}} arrived, update the stock", - "instantiation_dict": {"quantity": "5", "product": "blue Cronus yoga pants with size 33"}, - "format_specification": null, + "intent_template": "{{arrival_phrase}}, {{update_phrase}}", + "instantiation_dict": { + "arrival_phrase": "5 blue Cronus yoga pants with size 33 arrived", + "update_phrase": "update the stock" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "872", - "site": "shopping_admin", - "expected": {"stock_qty": 5, "in_stock": true} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/872/type/(simple|configurable)/store/0/set/\\d+/back/edit$", + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][qty]": "5"}, + 
"response_status": 302 + } } ], "revision": 2 @@ -23698,56 +22426,62 @@ "task_id": 769, "intent_template_id": 241, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory.", - "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.", - "instantiation_dict": {"quantity": "378", "product": "brown Aero daily fitness tee in every size"}, - "format_specification": null, + "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory", + "intent_template": "{{arrival_phrase}}, {{update_phrase}}", + "instantiation_dict": { + "arrival_phrase": "We've received 378 brown Aero daily fitness tee in every size", + "update_phrase": "please update the inventory" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/544/type/simple/store/0/set/9/", + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/544/type/simple/store/0/set/\\d+/back/edit$", "response_status": 302, - "http_method": "POST" - }, - "ignored_query_params_patterns": ["isAjax"] + "http_method": "POST", + "post_data": {"product[quantity_and_stock_status][qty]": "478"} + } }, { "evaluator": "NetworkEventEvaluator", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/547/type/simple/store/0/set/9/", - "http_method": "POST" - }, - "ignored_query_params_patterns": ["isAjax"] + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/547/type/simple/store/0/set/\\d+/back/edit$", + "post_data": {"product[quantity_and_stock_status][qty]": "478"}, + "http_method": "POST", + "response_status": 302 + } }, { "evaluator": 
"NetworkEventEvaluator", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/550/type/simple/store/0/set/9/", - "http_method": "POST" - }, - "ignored_query_params_patterns": ["isAjax"] + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/550/type/simple/store/0/set/\\d+/back/edit$", + "post_data": {"product[quantity_and_stock_status][qty]": "478"}, + "http_method": "POST", + "response_status": 302 + } }, { "evaluator": "NetworkEventEvaluator", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/553/type/simple/store/0/set/9/", - "http_method": "POST" - }, - "ignored_query_params_patterns": ["isAjax"] + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/553/type/simple/store/0/set/\\d+/back/edit$", + "post_data": {"product[quantity_and_stock_status][qty]": "478"}, + "http_method": "POST", + "response_status": 302 + } }, { "evaluator": "NetworkEventEvaluator", "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/556/type/simple/store/0/set/9/", - "http_method": "POST" - }, - "ignored_query_params_patterns": ["isAjax"] + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/556/type/simple/store/0/set/\\d+/back/edit$", + "post_data": {"product[quantity_and_stock_status][qty]": "478"}, + "http_method": "POST", + "response_status": 302 + } } ], "revision": 2 @@ -23757,40 +22491,36 @@ "task_id": 770, "intent_template_id": 241, "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.", - "intent_template": "We've received {{quantity}}, update the inventory.", - "instantiation_dict": {"quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29"}, - "format_specification": null, + "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory", + "intent_template": "{{arrival_phrase}}, {{update_phrase}}", + "instantiation_dict": { + "arrival_phrase": "We've 
received 12 white Cora parachute pant of size 28 and 56 blue of size 29", + "update_phrase": "update the inventory" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_post_data_params_patterns": ["^form_key$"], - "ignored_query_params_patterns": ["isAjax"], "expected": { "http_method": "POST", - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/1836/type/simple/store/0/set/10/", - "headers": {"referer": "__SHOPPING_ADMIN__/catalog/product/edit/id/1836/"}, + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1836/type/simple/store/0/set/\\d+/back/edit$", "post_data": {"product[quantity_and_stock_status][qty]": "112"}, - "response_status": 200, - "redirect_url": "" + "response_status": 302 } }, { "evaluator": "NetworkEventEvaluator", "ignored_post_data_params_patterns": ["^form_key$"], - "ignored_query_params_patterns": ["isAjax"], "expected": { "http_method": "POST", - "url": "__SHOPPING_ADMIN__/catalog/product/validate/id/1838/type/simple/store/0/set/10/", - "headers": {"referer": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/"}, + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1838/type/simple/store/0/set/\\d+/back/edit$", "post_data": {"product[quantity_and_stock_status][qty]": "156"}, - "response_status": 200, - "redirect_url": "" + "response_status": 302 } } ], @@ -23804,21 +22534,18 @@ "intent": "Approve reviews with four stars or higher to display in our store.", "intent_template": "Approve reviews with four stars or higher to display in our store.", "instantiation_dict": {}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": 
null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "expected": { "http_method": "POST", "url": "__SHOPPING_ADMIN__/review/product/save/id/352", - "headers": {"referer": "__SHOPPING_ADMIN__/review/product/edit/id/352/"}, "post_data": {"status_id": "1"}, - "redirect_url": "^__SHOPPING_ADMIN__/review/product/.*$", "response_status": 302 } }, @@ -23827,9 +22554,7 @@ "expected": { "http_method": "POST", "url": "__SHOPPING_ADMIN__/review/product/save/id/347", - "headers": {"referer": "__SHOPPING_ADMIN__/review/product/edit/id/347/"}, "post_data": {"status_id": "1"}, - "redirect_url": "^__SHOPPING_ADMIN__/review/product/.*$", "response_status": 302 } } @@ -23844,18 +22569,19 @@ "intent": "Delete all pending negative reviews for Circe fleece", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending negative reviews for Circe fleece"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "999", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/999/", + "response_status": 302, + "http_method": "POST" + } } ], "revision": 2 @@ -23868,24 +22594,27 @@ "intent": "Delete all pending negative reviews", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending negative reviews"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": 
{"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "351", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/351/", + "response_status": 302, + "http_method": "POST" + } }, { - "evaluator": "BackendStateEvaluator", - "review_id": "353", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/353/", + "response_status": 302, + "http_method": "POST" + } } ], "revision": 2 @@ -23898,30 +22627,35 @@ "intent": "Delete all pending reviews with less than 4 stars", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "pending reviews with less than 4 stars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "351", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/351/", + "response_status": 302, + "http_method": "POST" + } }, { - "evaluator": "BackendStateEvaluator", - "review_id": "353", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/353/", + "response_status": 302, + "http_method": "POST" + } }, { - "evaluator": "BackendStateEvaluator", - "review_id": "349", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": 
"__SHOPPING_ADMIN__/review/product/delete/id/349/", + "response_status": 302, + "http_method": "POST" + } } ], "revision": 2 @@ -23934,18 +22668,19 @@ "intent": "Delete all reviews from the scammer Arden", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "reviews from the scammer Arden"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "review_id": "51", - "site": "shopping_admin", - "expected": {"exists": false} + "evaluator": "NetworkEventEvaluator", + "expected": { + "url": "__SHOPPING_ADMIN__/review/product/delete/id/51/", + "response_status": 302, + "http_method": "POST" + } } ], "revision": 2 @@ -23958,12 +22693,11 @@ "intent": "Delete all reviews from the scammer Carlo", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "reviews from the scammer Carlo"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -23993,45 +22727,84 @@ "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "$5", - "performed_operation": "Reduce", + "action": "Reduce", "config": "green Hollister backyard sweatshirt in all sizes" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": 
"SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "120", - "site": "shopping_admin", - "expected": {"price": "47.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/120/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 47.0} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "117", - "site": "shopping_admin", - "expected": {"price": "47.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/117/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 47.0} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "114", - "site": "shopping_admin", - "expected": {"price": "47.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/114/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 47.0} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "111", - "site": "shopping_admin", - "expected": {"price": "47.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + 
"expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/111/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 47.0} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "123", - "site": "shopping_admin", - "expected": {"price": "47.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/123/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 47.0} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24043,35 +22816,54 @@ "start_urls": ["__SHOPPING_ADMIN__"], "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "13.5%", - "performed_operation": "Reduce", - "config": "size 28 Sahara leggings" - }, - "format_specification": null, + "instantiation_dict": {"amount": "13.5%", "action": "Reduce", "config": "size 28 Sahara leggings"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1841", - "site": "shopping_admin", - "expected": {"price": "64.88"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1841/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + 
"http_method": "POST", + "post_data": {"product[price]": 64.88} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1852", - "site": "shopping_admin", - "expected": {"price": "64.88"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1842/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 64.88} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1843", - "site": "shopping_admin", - "expected": {"price": "64.88"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1843/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 64.88} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24085,33 +22877,56 @@ "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "15%", - "performed_operation": "Reduce", + "action": "Reduce", "config": "yellow shirts from Gwyn Endurance in all sizes below L" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1559", - "site": "shopping_admin", - "expected": {"price": "20.40"} + "evaluator": "NetworkEventEvaluator", + 
"post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1559/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 20.4} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1562", - "site": "shopping_admin", - "expected": {"price": "20.40"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1562/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 20.4} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1565", - "site": "shopping_admin", - "expected": {"price": "20.40"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1565/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 20.4} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24121,31 +22936,46 @@ "task_id": 780, "intent_template_id": 742, "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], - "intent": "Increase the price of white Ingrid Running with size L and above on the current page by $17", + "intent": "Increase the price of white Ingrid Running with size L and above by $17", "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "$17", - "performed_operation": "Increase", - 
"config": "white Ingrid Running with size L and above on the current page" + "action": "Increase", + "config": "white Ingrid Running with size L and above" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1264", - "site": "shopping_admin", - "expected": {"price": "101.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1264/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 101.0} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1267", - "site": "shopping_admin", - "expected": {"price": "101.00"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1267/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 101.0} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24159,21 +22989,28 @@ "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "37%", - "performed_operation": "Increase", + "action": "Increase", "config": "black fitness tshirts from Desiree with size XS" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": 
{"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "1573", - "site": "shopping_admin", - "expected": {"price": "32.88"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1573/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 32.88} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24187,39 +23024,56 @@ "intent_template": "{{action}} the price of {{config}} by {{amount}}", "instantiation_dict": { "amount": "23%", - "performed_operation": "Increase", + "action": "Increase", "config": "all blue running tshirts in extra small and small sizes" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { - "evaluator": "BackendStateEvaluator", - "product_id": "496", - "site": "shopping_admin", - "expected": {"price": "35.67"} - }, - { - "evaluator": "BackendStateEvaluator", - "product_id": "499", - "site": "shopping_admin", - "expected": {"price": "35.67"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/499/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 35.67} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - 
"evaluator": "BackendStateEvaluator", - "product_id": "479", - "site": "shopping_admin", - "expected": {"price": "34.44"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/479/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 34.44} + }, + "ignored_query_params_patterns": ["isAjax"] }, { - "evaluator": "BackendStateEvaluator", - "product_id": "482", - "site": "shopping_admin", - "expected": {"price": "34.44"} + "evaluator": "NetworkEventEvaluator", + "post_data_schema": { + "type": "object", + "properties": { "product[price]": {"type": "number", "format": "currency"} } + }, + "expected": { + "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/482/type/simple/store/0/set/\\d+/back/edit$", + "response_status": 302, + "http_method": "POST", + "post_data": {"product[price]": 34.44} + }, + "ignored_query_params_patterns": ["isAjax"] } ], "revision": 2 @@ -24230,40 +23084,22 @@ "intent_template_id": 351, "start_urls": ["__GITLAB__"], "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: \"yjlou\" and \"a11yproject\"", - "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}", + "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", "instantiation_dict": { "repo": "kkroening/ffmpeg-python", "role": "maintainer", - "user_list": "\"yjlou\" and \"a11yproject\"" + "user_list": "\"yjlou\" and \"a11yproject\"", + "repo_prefix": "repo" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "PERMISSION_DENIED_ERROR", "retrieved_data": null } - }, - { - 
"evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/133/invitations", - "http_method": "POST", - "post_data": {"user_id": "168"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/133/invitations", - "http_method": "POST", - "post_data": {"user_id": "2325"}, - "response_status": 201 - } } ], "revision": 2 @@ -24274,16 +23110,18 @@ "intent_template_id": 316, "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], "intent": "Get the email address of the contributor who has the most commits to branch master in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", - "instantiation_dict": {"branch_name": "master", "attribute": "email address"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "email address", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["secupwn@users.noreply.github.com"] } @@ -24297,16 +23135,18 @@ "intent_template_id": 316, "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], "intent": "Get the email address of the contributor who has the most commits to branch gh-page in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", - "instantiation_dict": {"branch_name": "gh-page", "attribute": "email 
address"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", + "instantiation_dict": { + "branch_name": "gh-page", + "attribute": "email address", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "string"} }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["secupwn@users.noreply.github.com"] } @@ -24320,19 +23160,17 @@ "intent_template_id": 316, "start_urls": ["__GITLAB__/vinta/awesome-python"], "intent": "Get the number of commits of the contributor who has the most commits to branch master in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", - "instantiation_dict": {"branch_name": "master", "attribute": "number of commits"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "number of commits", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [414] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [414] } } ], "revision": 2 @@ -24343,19 +23181,17 @@ "intent_template_id": 316, "start_urls": ["__GITLAB__/dehenne/awesome-visibility/-/graphs/master"], "intent": "Get the number of followers of the contributor who has the most commits to branch 
master in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", - "instantiation_dict": {"branch_name": "master", "attribute": "number of followers"}, - "format_specification": null, + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", + "instantiation_dict": { + "branch_name": "master", + "attribute": "number of followers", + "retrieved_data_format_spec": "" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "performed_operation": "retrieve", - "status": "SUCCESS", - "retrieved_data": [0] - } + "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } } ], "revision": 2 @@ -24365,17 +23201,16 @@ "task_id": 788, "intent_template_id": 316, "start_urls": ["__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed"], - "intent": "Get the full name, username, user location and email of the contributor who has the most commits to branch php52 in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository", + "intent": "Get the full name, username, user location and email of the contributor who has the most commits to branch php52 in the current repository. 
Return a list of objects with keys \"full_name\", \"username\", \"user_location\", and \"email\".", + "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", "instantiation_dict": { "branch_name": "php52", - "attribute": "full name, username, user location and email" + "attribute": "full name, username, user location and email", + "retrieved_data_format_spec": ". Return a list of objects with keys \"full_name\", \"username\", \"user_location\", and \"email\"." }, - "format_specification": "Use an object with keys: \"full_name\", \"username\", \"user_location\" and \"email\"", "eval": [ { "evaluator": "AgentResponseEvaluator", - "ordered": false, "results_schema": { "type": "array", "items": { @@ -24385,17 +23220,16 @@ "username" : { "type": "string" }, "user_location": { "type": "string" }, "email" : { "type": "string" } - }, - "required": ["full_name", "username", "user_location", "email"] + } } }, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [ { - "full_name": "Andrew Whalen", - "username": "amwhalen", + "full_name": "Andrew M. 
Whalen", + "username": "^@?amwhalen$", "user_location": "Massachusetts", "email": "github@amwhalen.com" } @@ -24403,7 +23237,7 @@ } } ], - "revision": 2 + "revision": 4 }, { "sites": ["gitlab"], @@ -24417,16 +23251,11 @@ "issue": "ask the team if they have any plan on supporting Webagent in the next quater", "repo": "huggingface dataset" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "retrieve", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} } ], "revision": 2 @@ -24439,13 +23268,12 @@ "intent": "Delete all negative reviews for Sybil running short", "intent_template": "Delete all {{review_type}}", "instantiation_dict": {"review_type": "negative reviews for Sybil running short"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "mutate", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -24461,16 +23289,11 @@ "intent": "Use the Web IDE to create a folder named car in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the cars forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", "instantiation_dict": {"directory": "car", "gitlab_repo": "gimmiethat.space", "forum": "cars"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": { - "performed_operation": "mutate", - "status": "NOT_FOUND_ERROR", - "retrieved_data": null - } + "expected": {"task_type": "mutate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -24488,13 +23311,12 @@ "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20. Discard any items in your cart if it is not empty.", "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", "instantiation_dict": {"product_category": "Beauty & Personal Care", "dollar_value": "under 20"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -24513,13 +23335,12 @@ "product_category": "Men clothing", "dollar_value": "above 50 but under 129.99" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "retrieve", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -24538,20 +23359,18 @@ "order_selector": "my most recent non canceled order", "address": "4000 Forbes Ave, Pittsburgh, PA" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } }, { "evaluator": 
"NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -24568,20 +23387,18 @@ "order_selector": "my second most recent order", "address": "6726 McPherson Blvd, Pittsburgh, PA" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "mutate", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -24598,20 +23415,18 @@ "order_selector": "my oldest order in 2023", "address": "155 5th Street, San Francisco, CA" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -24628,20 +23443,18 @@ "order_selector": "my first order ever", "address": "3 Oxford St, Cambridge, MA" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -24658,20 +23471,18 @@ "order_selector": "my most recent non canceled order", "address": "77 Massachusetts Ave, Cambridge, MA" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "retrieve", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", 
"retrieved_data": null } }, { "evaluator": "NetworkEventEvaluator", - "site": "shopping", "expected": {"url": "__SHOPPING__/sales/order/history/"} } ], @@ -24685,17 +23496,15 @@ "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "n-lab", "members": "patou, egpast, westurner, jontutcher"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "__GITLAB__/groups", "http_method": "POST", @@ -24706,14 +23515,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/groups/n-lab/-/group_members"}, "http_method": "POST", "post_data": {"user_id": "400,443,561,586"}, "response_status": 201 @@ -24733,17 +23540,15 @@ "name": "x-lab", "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "__GITLAB__/groups", "http_method": "POST", @@ -24754,14 +23559,12 @@ 
{ "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/groups/x-lab/-/group_members"}, "http_method": "POST", "post_data": {"user_id": "632,64,86,96,340"}, "response_status": 201 @@ -24778,17 +23581,15 @@ "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix", "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "crew", "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "__GITLAB__/groups", "http_method": "POST", @@ -24799,14 +23600,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/groups/crew/-/group_members"}, "http_method": "POST", "post_data": {"user_id": "83,119,561,145"}, "response_status": 201 @@ -24823,17 +23622,15 @@ "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "coding_friends", "members": "qhduan, Agnes-U"}, - "format_specification": 
null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "__GITLAB__/groups", "http_method": "POST", @@ -24844,14 +23641,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "headers": {"referer": "__GITLAB__/groups/coding_friends/-/group_members"}, "http_method": "POST", "post_data": {"user_id": "196,244"}, "response_status": 201 @@ -24868,17 +23663,15 @@ "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", "intent_template": "create a new group \"{{name}}\" with members {{members}}", "instantiation_dict": {"name": "webagent", "members": "pandey2000, sayakpaul, sayakpaul"}, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "expected": { "url": "__GITLAB__/groups", "http_method": "POST", @@ -24889,14 +23682,12 @@ { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], - "last_event_only": true, "post_data_schema": { "type": "object", "properties": { "user_id": {"type": "string", "format": "string_list"} } }, "expected": { "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - 
"headers": {"referer": "__GITLAB__/groups/webagent/-/group_members"}, "http_method": "POST", "post_data": {"user_id": "223,224"}, "response_status": 201 @@ -24910,22 +23701,26 @@ "task_id": 804, "intent_template_id": 999, "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding flash alerts in dialog components to myself and tag primer to include them as a participant (username only).", - "intent_template": "Assign the issue regarding {{issue}} to {{account}}.", + "intent": "Assign the issue regarding flash alerts in dialog components to myself and tag primer to include them as a participant.", + "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", "instantiation_dict": { "issue": "flash alerts in dialog components", + "issue_location": "", "account": "myself and tag primer to include them as a participant" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], + "post_data_schema": { + "type": "object", + "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } + }, "expected": { "url": "__GITLAB__/primer/design/-/issues/104.json", "http_method": "PUT", @@ -24934,12 +23729,11 @@ }, { "evaluator": "NetworkEventEvaluator", - "gen_missing_post_data_schema": true, "expected": { "url": "__GITLAB__/primer/design/notes", "query_params": { "target_type": ["Issue"], "target_id": ["83759"] }, "http_method": "POST", - "post_data": {"$.note.note": "@primer"} + "post_data": {"$.note.note": "^.*@primer.*$"} } } ], @@ -24958,13 +23752,12 @@ "reviewer": "myself and Roshan", "source_project": "the current repository" }, - "format_specification": null, "eval": [ { "evaluator": 
"AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "mutate", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -24985,12 +23778,11 @@ "reviewer": "myself", "source_project": "a11yproject.com" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -25024,13 +23816,12 @@ "reviewer": "myself", "source_project": "the current repository" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, "expected": { - "performed_operation": "mutate", + "task_type": "mutate", "status": "ACTION_NOT_ALLOWED_ERROR", "retrieved_data": null } @@ -25043,7 +23834,7 @@ "task_id": 808, "intent_template_id": 327, "start_urls": ["__GITLAB__"], - "intent": "Create an issue in cloud-to-butt repo with title \"Let\"s keep the project alive\". Assign the issue to myself (you can invite users to the project if needed). Set due date to be the end of Q1 2033", + "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself (you can invite users to the project if needed). Set due date to be the end of Q1 2033", "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", "instantiation_dict": { "repo": "cloud-to-butt", @@ -25051,12 +23842,11 @@ "account": "myself", "due": "the end of Q1 2033" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -25064,7 +23854,7 @@ "url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", "http_method": "POST", "post_data": { - "issue[title]": "Let\"s keep the project alive", + "issue[title]": "Let's keep the project alive", "issue[assignee_ids][]": "2330", "issue[due_date]": "2033-03-31" }, @@ -25087,12 +23877,11 @@ "account": "myself", "due": "2030-1-3" }, - "format_specification": null, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", @@ -25116,18 +23905,25 @@ "intent_template_id": 999, "start_urls": ["__GITLAB__"], "intent": "Assign the issue regarding flash alert bug in primer design guide repo to myself.", - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.", - "instantiation_dict": {"repo": "primer design guide", "issue": "flash alert bug", "account": "myself"}, - "format_specification": null, + "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", + "instantiation_dict": { + "issue": "flash alert bug", + "issue_location": "in primer design guide repo ", + "account": "myself" + }, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": 
null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", "ignored_query_params": ["serializer"], + "post_data_schema": { + "type": "object", + "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } + }, "expected": { "url": "__GITLAB__/primer/design/-/issues/316.json", "http_method": "PUT", @@ -25143,14 +23939,13 @@ "intent_template_id": 999, "start_urls": ["__GITLAB__"], "intent": "Assign the issue regarding 404 in a11yproject to myself.", - "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.", - "instantiation_dict": {"repo": "a11yproject", "issue": 404, "account": "myself"}, - "format_specification": null, + "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", + "instantiation_dict": {"issue": "404", "issue_location": "in a11yproject ", "account": "myself"}, "eval": [ { "evaluator": "AgentResponseEvaluator", "results_schema": {"type": "null"}, - "expected": {"performed_operation": "mutate", "status": "SUCCESS", "retrieved_data": null} + "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} }, { "evaluator": "NetworkEventEvaluator", From f326bbfc2c6d55c5438d75f7a9684bbeed20415d Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 2 Dec 2025 19:44:20 +0000 Subject: [PATCH 43/64] update instructions to fetch latest version before the public release --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index ea1fb75c..07afd42f 100644 --- a/Makefile +++ b/Makefile @@ -30,15 +30,15 @@ setup-miniwob: setup-webarena-verified: @echo "--- 🌐 Setting up WebArena Verified ---" - @if [ ! -d "../platform-labs-webarena-verified" ]; then \ + @if [ ! 
-d "../platform-labs-webarena-verified-internal" ]; then \ echo "Cloning WebArena Verified repository..."; \ - git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified; \ + git clone https://github.com/ServiceNow/platform-labs-webarena-verified-internal.git ../platform-labs-webarena-verified-internal; \ else \ echo "WebArena Verified repository already exists, skipping clone..."; \ fi @echo "Installing WebArena Verified package..." - pip install -e ../platform-labs-webarena-verified - cp ../platform-labs-webarena-verified/assets/dataset/webarena-verified.json ./browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json + pip install -e ../platform-labs-webarena-verified-internal + cp ../platform-labs-webarena-verified-internal/assets/dataset/webarena-verified.json ./browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json @echo "✅ WebArena Verified setup complete!" test-core: From ced10213cdc6c9566cb2775e872631286386aa9b Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 4 Dec 2025 21:32:59 +0000 Subject: [PATCH 44/64] exponential backoff --- .../src/browsergym/webarena_verified/task.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 65ee439d..986a6231 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -125,7 +125,7 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: except playwright_errors.TimeoutError as e: if attempt == 2: # Last attempt (0, 1, 2) raise # Re-raise the timeout error after 3 failed attempts - sleep(1) # Wait 1 second before retrying + sleep(2**attempt) # Exponential backoff # enable playwright tracing (required for webarena_verified 
evaluation) page.context.tracing.start(snapshots=True) @@ -145,7 +145,7 @@ def setup(self, page: playwright.sync_api.Page) -> tuple[str, dict]: except playwright_errors.TimeoutError as e: if attempt == 2: # Last attempt (0, 1, 2) raise # Re-raise the timeout error after 3 failed attempts - sleep(1) # Wait 1 second before retrying + sleep(2**attempt) # Exponential backoff if i < len(start_urls) - 1: page = page.context.new_page() From 106a685d46d877d43ae3cb32dabc9ecb78890602 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 4 Dec 2025 21:43:30 +0000 Subject: [PATCH 45/64] update README --- browsergym/webarena_verified/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 6504510d..dd56ab87 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -48,6 +48,8 @@ for domain in domains: task_list.extend(task_metadata("webarena_verified").groupby("sites").get_group(domain).task_name.to_list()) benchmark = bgym.DEFAULT_BENCHMARKS["webarena_verified"]() # type: bgym.Benchmark benchmark = benchmark.subset_from_list( - task_list, "webarena_verified"_suffix=f"only_{'-'.join(domains)}" + task_list, benchmark_name_suffix=f"_{'-'.join(DOMAINS)}" ) ``` + +**NOTE**: Tasks are registered with this template: `webarena_verified.{intent_template_id}.{task_id}` From 0019f4e86e6d582609f20cf58ea98c5b2fdf84fd Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 4 Dec 2025 22:01:11 +0000 Subject: [PATCH 46/64] compare json with the one in the library --- .../browsergym/webarena_verified/config.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index 550b9af2..140ede5c 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ 
b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -1,3 +1,5 @@ +import difflib +import importlib.resources import json from pathlib import Path @@ -7,6 +9,27 @@ with open(Path(__file__).parent / "webarena_verified.json", "r") as f: data = json.load(f) +# Check if the json file is the same as the one in the webarena-verified repository +library_json_string = ( + importlib.resources.files("webarena_verified") + .joinpath("../../assets/dataset/webarena-verified.json") + .read_text() +) +library_json = json.loads(library_json_string) + +if json.dumps(data, sort_keys=True, indent=2) != json.dumps(library_json, sort_keys=True, indent=2): + print( + "Warning: the json file is not the same as the one in the webarena-verified repository. Consider updating the library." + ) + print("=" * 100) + print("Differences:") + for diff in difflib.unified_diff( + json.dumps(data, sort_keys=True, indent=2).splitlines(), + json.dumps(library_json, sort_keys=True, indent=2).splitlines(), + ): + print(diff) + print("=" * 100) + for task in data: INTENT_TEMPLATE_IDS.append(task["intent_template_id"]) From e02a299ef42c96ff09f86d584663293cef595477 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Thu, 4 Dec 2025 22:04:02 +0000 Subject: [PATCH 47/64] update install instructions --- Makefile | 13 ------------- browsergym/webarena_verified/README.md | 3 --- browsergym/webarena_verified/requirements.txt | 3 ++- 3 files changed, 2 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 07afd42f..71301a80 100644 --- a/Makefile +++ b/Makefile @@ -28,19 +28,6 @@ setup-miniwob: @echo "💡 To use MiniWoB++, load the environment variables:" @echo " source .env" -setup-webarena-verified: - @echo "--- 🌐 Setting up WebArena Verified ---" - @if [ ! 
-d "../platform-labs-webarena-verified-internal" ]; then \ - echo "Cloning WebArena Verified repository..."; \ - git clone https://github.com/ServiceNow/platform-labs-webarena-verified-internal.git ../platform-labs-webarena-verified-internal; \ - else \ - echo "WebArena Verified repository already exists, skipping clone..."; \ - fi - @echo "Installing WebArena Verified package..." - pip install -e ../platform-labs-webarena-verified-internal - cp ../platform-labs-webarena-verified-internal/assets/dataset/webarena-verified.json ./browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json - @echo "✅ WebArena Verified setup complete!" - test-core: @echo "--- 🧪 Running tests ---" pytest -n auto ./tests/core diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index dd56ab87..279505b7 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -12,13 +12,10 @@ Follow the official [webarena README](https://github.com/web-arena-x/webarena/bl ```bash make install -make setup-webarena-verified # this command will clone & install webarena-verified locally in the same folder that contains BrowserGym ``` Alternatively, you can also run: ```bash pip install -e ./browsergym/webarena_verified -git clone https://github.com/ServiceNow/platform-labs-webarena-verified.git ../platform-labs-webarena-verified -pip install -e ../platform-labs-webarena-verified ``` #### 2. 
Setup WebArena environment URLs diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 6a0d5f79..46075a76 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1 +1,2 @@ -browsergym-core==0.14.3.dev1 \ No newline at end of file +browsergym-core==0.14.3.dev1 +webarena-verified @ git+https://github.com/ServiceNow/webarena-verified \ No newline at end of file From 045d0e4a8b55f5f12de25a1755c6c781a837dd38 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Tue, 9 Dec 2025 15:27:30 +0000 Subject: [PATCH 48/64] update makefile --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 71301a80..14da494c 100644 --- a/Makefile +++ b/Makefile @@ -41,11 +41,10 @@ help: @echo "Available targets:" @echo " install - Install project dependencies" @echo " setup-miniwob - Setup MiniWoB++ dependencies" - @echo " setup-webarena-verified - Setup WebArena Verified dependencies" @echo " install-demo - Install demo dependencies" @echo " demo - Run demo agent" @echo " test-core - Run core tests" @echo " clean-miniwob - Remove MiniWoB++ directory" @echo " help - Show this help message" -.PHONY: install setup-miniwob setup-webarena-verified install-demo demo test-core clean-miniwob help +.PHONY: install setup-miniwob install-demo demo test-core clean-miniwob help From 5435db39daa1c3df646200e9c4d7d53a28f705e8 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Tue, 9 Dec 2025 15:19:22 -0500 Subject: [PATCH 49/64] update pypi deployment with webarena-verified --- .github/workflows/pypi.yml | 3 +++ .../src/browsergym/experiments/benchmark/configs.py | 10 +++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 51e08ff9..e6ca172f 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -30,6 +30,9 @@ jobs: run: python3 -m build 
browsergym/webarena/ --outdir dist/ - name: Build a binary wheel and a source tarball (browsergym-webarenalite) run: python3 -m build browsergym/webarenalite/ --outdir dist/ + + - name: Build a binary wheel and a source tarball (browsergym-webarena-verified) + run: python3 -m build browsergym/webarena_verified/ --outdir dist - name: Build a binary wheel and a source tarball (browsergym-webarena) run: python3 -m build browsergym/visualwebarena/ --outdir dist/ diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index 9551111d..ede9ca08 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -140,13 +140,16 @@ supports_parallel_seeds=False, backends=["webarena_verified"], env_args_list=make_env_args_list_from_repeat_tasks( - task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), + task_list=task_list_from_metadata( + metadata=task_metadata("webarena_verified") + ), max_steps=30, n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), ), task_metadata=task_metadata("webarena_verified"), - ), + ), # TODO: Add webarena-verified hard subsets by filtering tasks in + # https://github.com/ServiceNow/webarena-verified/blob/main/assets/dataset/subsets/webarena-verified-hard.json "webarena_lite": lambda n_repeats=1: Benchmark( name="webarena_lite", high_level_action_set_args=DEFAULT_HIGHLEVEL_ACTION_SET_ARGS["webarena"], @@ -267,7 +270,8 @@ backends=["assistantbench"], env_args_list=make_env_args_list_from_repeat_tasks( task_list=task_list_from_metadata( - metadata=task_metadata("assistantbench"), filter={"browsergym_split": "valid|test"} + metadata=task_metadata("assistantbench"), + filter={"browsergym_split": "valid|test"}, ), max_steps=30, n_repeats=n_repeats, From e5c75cab500cb5e6b5dbee90f418d56f6e157e56 Mon Sep 17 00:00:00 2001 From: 
Aman Jaiswal Date: Tue, 9 Dec 2025 15:51:24 -0500 Subject: [PATCH 50/64] fix assets directory --- .../src/browsergym/webarena_verified/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index 140ede5c..c412c7df 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -12,7 +12,7 @@ # Check if the json file is the same as the one in the webarena-verified repository library_json_string = ( importlib.resources.files("webarena_verified") - .joinpath("../../assets/dataset/webarena-verified.json") + .joinpath("assets/dataset/webarena-verified.json") .read_text() ) library_json = json.loads(library_json_string) From c2d15364e48308175a53c6268af041ff5abcfd2f Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Dec 2025 16:15:36 +0000 Subject: [PATCH 51/64] fix task id template --- .../experiments/src/browsergym/experiments/benchmark/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index ef9d37c0..7a9948e3 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -162,7 +162,7 @@ def prepare_backend(backend: str): (23, 410), # reddit (330, 533), # gitlab (87, 561), # gitlab wiki - (87, 562), # gitlab reddit + (88, 562), # gitlab reddit (165, 574), # shopping (16, 640), # reddit (253, 680), # shopping_admin From 55a57b08c959ddcb1ff8c0cb93135f14657f4b49 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Fri, 12 Dec 2025 17:30:25 +0000 Subject: [PATCH 52/64] remove task json file, use the one from the webarena-verified library. 
Update task template to include revision number --- .../benchmark/metadata/webarena_verified.csv | 1624 +- .../browsergym/experiments/benchmark/utils.py | 20 +- browsergym/pyproject.toml | 1 + browsergym/webarena_verified/README.md | 8 +- browsergym/webarena_verified/pyproject.toml | 5 - .../browsergym/webarena_verified/__init__.py | 6 +- .../browsergym/webarena_verified/config.py | 29 +- .../webarena_verified/evaluators.py | 5 +- .../src/browsergym/webarena_verified/task.py | 4 +- .../webarena_verified/webarena_verified.json | 23962 ---------------- 10 files changed, 847 insertions(+), 24817 deletions(-) delete mode 100644 browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv index 8bdd5a95..068301a0 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv @@ -1,813 +1,813 @@ task_name,requires_reset,sites,eval_types,task_id,browsergym_split,depends_on -webarena_verified.279.0,False,shopping_admin,retrieve_value,0,train, -webarena_verified.279.1,False,shopping_admin,retrieve_value,1,test,webarena_verified.279.0 -webarena_verified.279.2,False,shopping_admin,retrieve_value,2,train,webarena_verified.279.1 -webarena_verified.279.3,False,shopping_admin,retrieve_value,3,test,webarena_verified.279.2 -webarena_verified.279.4,False,shopping_admin,retrieve_value,4,train,webarena_verified.279.3 -webarena_verified.279.5,False,shopping_admin,retrieve_value,5,train,webarena_verified.279.4 -webarena_verified.279.6,False,shopping_admin,retrieve_value,6,test,webarena_verified.279.5 -webarena_verified.79.7,False,map,retrieve_value,7,train, -webarena_verified.79.8,False,map,string_match,8,test,webarena_verified.79.7 
-webarena_verified.79.9,False,map,retrieve_value,9,test,webarena_verified.79.8 -webarena_verified.79.10,False,map,retrieve_value,10,test,webarena_verified.79.9 -webarena_verified.288.11,False,shopping_admin,retrieve_value,11,test,webarena_verified.279.6 -webarena_verified.288.12,False,shopping_admin,retrieve_value,12,train,webarena_verified.288.11 -webarena_verified.288.13,False,shopping_admin,retrieve_value,13,train,webarena_verified.288.12 -webarena_verified.288.14,False,shopping_admin,retrieve_value,14,train,webarena_verified.288.13 -webarena_verified.288.15,False,shopping_admin,retrieve_value,15,test,webarena_verified.288.14 -webarena_verified.73.16,False,map,string_match,16,test,webarena_verified.79.10 -webarena_verified.73.17,False,map,string_match,17,train,webarena_verified.73.16 -webarena_verified.73.18,False,map,string_match,18,test,webarena_verified.73.17 -webarena_verified.73.19,False,map,string_match,19,train,webarena_verified.73.18 -webarena_verified.73.20,False,map,string_match,20,test,webarena_verified.73.19 -webarena_verified.222.21,False,shopping,retrieve_value,21,test, -webarena_verified.222.22,False,shopping,retrieve_value,22,test,webarena_verified.222.21 -webarena_verified.222.23,False,shopping,retrieve_value,23,test,webarena_verified.222.22 -webarena_verified.222.24,False,shopping,retrieve_value,24,test,webarena_verified.222.23 -webarena_verified.222.25,False,shopping,retrieve_value,25,test,webarena_verified.222.24 -webarena_verified.222.26,False,shopping,retrieve_value,26,test,webarena_verified.222.25 -webarena_verified.33.27,False,reddit,retrieve_value,27,test, -webarena_verified.33.28,False,reddit,retrieve_value,28,train,webarena_verified.33.27 -webarena_verified.33.29,False,reddit,retrieve_value,29,train,webarena_verified.33.28 -webarena_verified.33.30,False,reddit,retrieve_value,30,test,webarena_verified.33.29 -webarena_verified.33.31,False,reddit,retrieve_value,31,train,webarena_verified.33.30 
-webarena_verified.78.32,False,map,retrieve_value,32,test,webarena_verified.73.20 -webarena_verified.78.33,False,map,retrieve_value,33,test,webarena_verified.78.32 -webarena_verified.78.34,False,map,retrieve_value,34,train,webarena_verified.78.33 -webarena_verified.78.35,False,map,retrieve_value,35,test,webarena_verified.78.34 -webarena_verified.77.36,False,map,retrieve_value,36,test,webarena_verified.78.35 -webarena_verified.77.37,False,map,retrieve_value,37,train,webarena_verified.77.36 -webarena_verified.77.38,False,map,retrieve_value,38,train,webarena_verified.77.37 -webarena_verified.77.39,False,map,retrieve_value,39,train,webarena_verified.77.38 -webarena_verified.77.40,False,map,retrieve_value,40,test,webarena_verified.77.39 -webarena_verified.285.41,False,shopping_admin,retrieve_value,41,train,webarena_verified.288.15 -webarena_verified.285.42,False,shopping_admin,retrieve_value,42,train,webarena_verified.285.41 -webarena_verified.285.43,False,shopping_admin,retrieve_value,43,test,webarena_verified.285.42 -webarena_verified.303.44,False,gitlab,ui_state,44,train, -webarena_verified.300.45,False,gitlab,ui_state,45,test,webarena_verified.303.44 -webarena_verified.300.46,False,gitlab,ui_state,46,test,webarena_verified.300.45 -webarena_verified.197.47,False,shopping,retrieve_value,47,train,webarena_verified.222.26 -webarena_verified.197.48,False,shopping,retrieve_value,48,test,webarena_verified.197.47 -webarena_verified.197.49,False,shopping,retrieve_value,49,train,webarena_verified.197.48 -webarena_verified.197.50,False,shopping,retrieve_value,50,train,webarena_verified.197.49 -webarena_verified.197.51,False,shopping,retrieve_value,51,test,webarena_verified.197.50 -webarena_verified.68.52,False,map,string_match,52,test,webarena_verified.77.40 -webarena_verified.68.53,False,map,string_match,53,train,webarena_verified.68.52 -webarena_verified.68.54,False,map,string_match,54,test,webarena_verified.68.53 
-webarena_verified.68.55,False,map,string_match,55,train,webarena_verified.68.54 -webarena_verified.68.56,False,map,string_match,56,train,webarena_verified.68.55 -webarena_verified.69.57,False,map,retrieve_value,57,train,webarena_verified.68.56 -webarena_verified.69.58,False,map,retrieve_value,58,train,webarena_verified.69.57 -webarena_verified.69.59,False,map,retrieve_value,59,test,webarena_verified.69.58 -webarena_verified.69.60,False,map,retrieve_value,60,test,webarena_verified.69.59 -webarena_verified.69.61,False,map,retrieve_value,61,train,webarena_verified.69.60 -webarena_verified.276.62,False,shopping_admin,retrieve_value,62,train,webarena_verified.285.43 -webarena_verified.276.63,False,shopping_admin,retrieve_value,63,test,webarena_verified.276.62 -webarena_verified.276.64,False,shopping_admin,retrieve_value,64,test,webarena_verified.276.63 -webarena_verified.276.65,False,shopping_admin,retrieve_value,65,train,webarena_verified.276.64 -webarena_verified.17.66,False,reddit,retrieve_value,66,test,webarena_verified.33.31 -webarena_verified.17.67,False,reddit,retrieve_value,67,test,webarena_verified.17.66 -webarena_verified.17.68,False,reddit,retrieve_value,68,train,webarena_verified.17.67 -webarena_verified.17.69,False,reddit,retrieve_value,69,test,webarena_verified.17.68 -webarena_verified.70.70,False,map,retrieve_value,70,train,webarena_verified.69.61 -webarena_verified.70.71,False,map,retrieve_value,71,test,webarena_verified.70.70 -webarena_verified.70.72,False,map,retrieve_value,72,train,webarena_verified.70.71 -webarena_verified.70.73,False,map,retrieve_value,73,test,webarena_verified.70.72 -webarena_verified.65.74,False,map,string_match,74,train,webarena_verified.70.73 -webarena_verified.65.75,False,map,string_match,75,train,webarena_verified.65.74 -webarena_verified.65.76,False,map,retrieve_value,76,train,webarena_verified.65.75 -webarena_verified.277.77,False,shopping_admin,retrieve_value,77,test,webarena_verified.276.65 
-webarena_verified.277.78,False,shopping_admin,retrieve_value,78,train,webarena_verified.277.77 -webarena_verified.277.79,False,shopping_admin,retrieve_value,79,test,webarena_verified.277.78 -webarena_verified.72.80,False,map,string_match,80,test,webarena_verified.65.76 -webarena_verified.72.81,False,map,string_match,81,test,webarena_verified.72.80 -webarena_verified.72.82,False,map,string_match,82,train,webarena_verified.72.81 -webarena_verified.72.83,False,map,string_match,83,train,webarena_verified.72.82 -webarena_verified.64.84,False,map,string_match,84,train,webarena_verified.72.83 -webarena_verified.64.85,False,map,string_match,85,test,webarena_verified.64.84 -webarena_verified.64.86,False,map,string_match,86,test,webarena_verified.64.85 -webarena_verified.64.87,False,map,string_match,87,train,webarena_verified.64.86 -webarena_verified.64.88,False,map,string_match,88,train,webarena_verified.64.87 -webarena_verified.67.89,False,map,retrieve_value,89,test,webarena_verified.64.88 -webarena_verified.67.90,False,map,retrieve_value,90,test,webarena_verified.67.89 -webarena_verified.67.91,False,map,retrieve_value,91,train,webarena_verified.67.90 -webarena_verified.67.92,False,map,retrieve_value,92,train,webarena_verified.67.91 -webarena_verified.67.93,False,map,retrieve_value,93,train,webarena_verified.67.92 -webarena_verified.274.94,False,shopping_admin,retrieve_value,94,test,webarena_verified.277.79 -webarena_verified.274.95,False,shopping_admin,retrieve_value,95,train,webarena_verified.274.94 -webarena_verified.193.96,False,shopping,retrieve_value,96,test,webarena_verified.197.51 -webarena_verified.120.97,False,map wikipedia,retrieve_value,97,test,webarena_verified.67.93 -webarena_verified.66.98,False,map,retrieve_value,98,test,webarena_verified.120.97 -webarena_verified.66.99,False,map,retrieve_value,99,train,webarena_verified.66.98 -webarena_verified.66.100,False,map,retrieve_value,100,test,webarena_verified.66.99 
-webarena_verified.66.101,False,map,string_match,101,train,webarena_verified.66.100 -webarena_verified.349.102,False,gitlab,ui_state,102,train,webarena_verified.300.46 -webarena_verified.349.103,False,gitlab,ui_state,103,train,webarena_verified.349.102 -webarena_verified.349.104,False,gitlab,ui_state,104,test,webarena_verified.349.103 -webarena_verified.349.105,False,gitlab,ui_state,105,train,webarena_verified.349.104 -webarena_verified.349.106,False,gitlab,ui_state,106,test,webarena_verified.349.105 -webarena_verified.270.107,False,shopping_admin,retrieve_value,107,test,webarena_verified.274.95 -webarena_verified.270.108,False,shopping_admin,retrieve_value,108,train,webarena_verified.270.107 -webarena_verified.270.109,False,shopping_admin,retrieve_value,109,test,webarena_verified.270.108 -webarena_verified.270.110,False,shopping_admin,retrieve_value,110,train,webarena_verified.270.109 -webarena_verified.270.111,False,shopping_admin,retrieve_value,111,train,webarena_verified.270.110 -webarena_verified.245.112,False,shopping_admin,retrieve_value,112,test,webarena_verified.270.111 -webarena_verified.245.113,False,shopping_admin,retrieve_value,113,test,webarena_verified.245.112 -webarena_verified.245.114,False,shopping_admin,retrieve_value,114,train,webarena_verified.245.113 -webarena_verified.245.115,False,shopping_admin,retrieve_value,115,test,webarena_verified.245.114 -webarena_verified.245.116,False,shopping_admin,retrieve_value,116,test,webarena_verified.245.115 -webarena_verified.161.117,False,shopping,retrieve_value,117,test,webarena_verified.193.96 -webarena_verified.151.118,False,shopping,program_html,118,train,webarena_verified.161.117 -webarena_verified.250.119,False,shopping_admin,retrieve_value,119,test,webarena_verified.245.116 -webarena_verified.250.120,False,shopping_admin,retrieve_value,120,train,webarena_verified.250.119 -webarena_verified.250.121,False,shopping_admin,retrieve_value,121,train,webarena_verified.250.120 
-webarena_verified.250.122,False,shopping_admin,retrieve_value,122,test,webarena_verified.250.121 -webarena_verified.250.123,False,shopping_admin,retrieve_value,123,train,webarena_verified.250.122 -webarena_verified.159.124,False,shopping,retrieve_value,124,train,webarena_verified.151.118 -webarena_verified.159.125,False,shopping,retrieve_value,125,train,webarena_verified.159.124 -webarena_verified.159.126,False,shopping,retrieve_value,126,test,webarena_verified.159.125 -webarena_verified.1001.127,False,shopping_admin,retrieve_value,127,train,webarena_verified.250.123 -webarena_verified.1002.128,False,shopping_admin,retrieve_value,128,train,webarena_verified.1001.127 -webarena_verified.1002.129,False,shopping_admin,retrieve_value,129,train,webarena_verified.1002.128 -webarena_verified.1002.130,False,shopping_admin,retrieve_value,130,train,webarena_verified.1002.129 -webarena_verified.1002.131,False,shopping_admin,retrieve_value,131,test,webarena_verified.1002.130 -webarena_verified.322.132,False,gitlab,retrieve_value,132,train,webarena_verified.349.106 -webarena_verified.322.133,False,gitlab,retrieve_value,133,test,webarena_verified.322.132 -webarena_verified.322.134,False,gitlab,retrieve_value,134,test,webarena_verified.322.133 -webarena_verified.322.135,False,gitlab,retrieve_value,135,train,webarena_verified.322.134 -webarena_verified.322.136,False,gitlab,retrieve_value,136,train,webarena_verified.322.135 -webarena_verified.51.137,False,map,string_match,137,test,webarena_verified.66.101 -webarena_verified.51.138,False,map,string_match,138,test,webarena_verified.51.137 -webarena_verified.51.139,False,map,string_match,139,test,webarena_verified.51.138 -webarena_verified.51.140,False,map,string_match,140,train,webarena_verified.51.139 -webarena_verified.162.141,False,shopping,retrieve_value,141,train,webarena_verified.159.126 -webarena_verified.162.142,False,shopping,retrieve_value,142,train,webarena_verified.162.141 
-webarena_verified.162.143,False,shopping,retrieve_value,143,test,webarena_verified.162.142 -webarena_verified.162.144,False,shopping,retrieve_value,144,test,webarena_verified.162.143 -webarena_verified.162.145,False,shopping,retrieve_value,145,train,webarena_verified.162.144 -webarena_verified.155.146,False,shopping,retrieve_value,146,test,webarena_verified.162.145 -webarena_verified.155.147,False,shopping,retrieve_value,147,train,webarena_verified.155.146 -webarena_verified.155.148,False,shopping,retrieve_value,148,train,webarena_verified.155.147 -webarena_verified.155.149,False,shopping,retrieve_value,149,test,webarena_verified.155.148 -webarena_verified.155.150,False,shopping,retrieve_value,150,train,webarena_verified.155.149 -webarena_verified.36.151,False,map,string_match,151,train,webarena_verified.51.140 -webarena_verified.36.152,False,map,string_match,152,train,webarena_verified.36.151 -webarena_verified.36.153,False,map,string_match,153,test,webarena_verified.36.152 -webarena_verified.36.154,False,map,string_match,154,train,webarena_verified.36.153 -webarena_verified.36.155,False,map,string_match,155,test,webarena_verified.36.154 -webarena_verified.290.156,False,gitlab,ui_state,156,test,webarena_verified.322.136 -webarena_verified.255.157,False,shopping_admin,ui_state,157,train,webarena_verified.1002.131 -webarena_verified.171.158,False,shopping,ui_state,158,test,webarena_verified.155.150 -webarena_verified.171.159,False,shopping,ui_state,159,train,webarena_verified.171.158 -webarena_verified.171.160,False,shopping,ui_state,160,train,webarena_verified.171.159 -webarena_verified.171.161,False,shopping,ui_state,161,train,webarena_verified.171.160 -webarena_verified.171.162,False,shopping,ui_state,162,test,webarena_verified.171.161 -webarena_verified.136.163,False,shopping,retrieve_value,163,test,webarena_verified.171.162 -webarena_verified.136.164,False,shopping,retrieve_value,164,test,webarena_verified.136.163 
-webarena_verified.136.165,False,shopping,retrieve_value,165,test,webarena_verified.136.164 -webarena_verified.136.166,False,shopping,retrieve_value,166,test,webarena_verified.136.165 -webarena_verified.136.167,False,shopping,retrieve_value,167,test,webarena_verified.136.166 -webarena_verified.289.168,False,gitlab,retrieve_value,168,test,webarena_verified.290.156 -webarena_verified.289.169,False,gitlab,retrieve_value,169,train,webarena_verified.289.168 -webarena_verified.289.170,False,gitlab,retrieve_value,170,train,webarena_verified.289.169 -webarena_verified.289.171,False,gitlab,retrieve_value,171,test,webarena_verified.289.170 -webarena_verified.289.172,False,gitlab,retrieve_value,172,train,webarena_verified.289.171 -webarena_verified.310.173,False,gitlab,retrieve_value,173,train,webarena_verified.289.172 -webarena_verified.310.174,False,gitlab,retrieve_value,174,test,webarena_verified.310.173 -webarena_verified.310.175,False,gitlab,retrieve_value,175,train,webarena_verified.310.174 -webarena_verified.310.176,False,gitlab,retrieve_value,176,train,webarena_verified.310.175 -webarena_verified.310.177,False,gitlab,retrieve_value,177,test,webarena_verified.310.176 -webarena_verified.500.178,False,gitlab,retrieve_value,178,test,webarena_verified.310.177 -webarena_verified.500.179,False,gitlab,retrieve_value,179,train,webarena_verified.500.178 -webarena_verified.500.180,False,gitlab,retrieve_value,180,train,webarena_verified.500.179 -webarena_verified.500.181,False,gitlab,retrieve_value,181,test,webarena_verified.500.180 -webarena_verified.500.182,False,gitlab,retrieve_value,182,train,webarena_verified.500.181 -webarena_verified.368.183,False,shopping_admin,retrieve_value,183,train,webarena_verified.255.157 -webarena_verified.368.184,False,shopping_admin,retrieve_value,184,train,webarena_verified.368.183 -webarena_verified.368.185,False,shopping_admin,retrieve_value,185,test,webarena_verified.368.184 
-webarena_verified.368.186,False,shopping_admin,retrieve_value,186,train,webarena_verified.368.185 -webarena_verified.368.187,False,shopping_admin,retrieve_value,187,test,webarena_verified.368.186 -webarena_verified.214.188,False,shopping,retrieve_value,188,test,webarena_verified.136.167 -webarena_verified.214.189,False,shopping,retrieve_value,189,train,webarena_verified.214.188 -webarena_verified.214.190,False,shopping,retrieve_value,190,train,webarena_verified.214.189 -webarena_verified.214.191,False,shopping,retrieve_value,191,train,webarena_verified.214.190 -webarena_verified.214.192,False,shopping,retrieve_value,192,test,webarena_verified.214.191 -webarena_verified.367.193,False,shopping_admin,retrieve_value,193,train,webarena_verified.368.187 -webarena_verified.367.194,False,shopping_admin,retrieve_value,194,train,webarena_verified.367.193 -webarena_verified.367.195,False,shopping_admin,retrieve_value,195,test,webarena_verified.367.194 -webarena_verified.367.196,False,shopping_admin,retrieve_value,196,train,webarena_verified.367.195 -webarena_verified.367.197,False,shopping_admin,retrieve_value,197,train,webarena_verified.367.196 -webarena_verified.366.198,False,shopping_admin,retrieve_value,198,train,webarena_verified.367.197 -webarena_verified.366.199,False,shopping_admin,retrieve_value,199,train,webarena_verified.366.198 -webarena_verified.366.200,False,shopping_admin,retrieve_value,200,train,webarena_verified.366.199 -webarena_verified.366.201,False,shopping_admin,retrieve_value,201,test,webarena_verified.366.200 -webarena_verified.366.202,False,shopping_admin,retrieve_value,202,train,webarena_verified.366.201 -webarena_verified.366.203,False,shopping_admin,retrieve_value,203,test,webarena_verified.366.202 -webarena_verified.366.204,False,shopping_admin,retrieve_value,204,test,webarena_verified.366.203 -webarena_verified.320.205,False,gitlab,retrieve_value,205,train,webarena_verified.500.182 
-webarena_verified.320.206,False,gitlab,retrieve_value,206,test,webarena_verified.320.205 -webarena_verified.320.207,False,gitlab,retrieve_value,207,test,webarena_verified.320.206 -webarena_verified.364.208,False,shopping_admin,retrieve_value,208,test,webarena_verified.366.204 -webarena_verified.364.209,False,shopping_admin,retrieve_value,209,test,webarena_verified.364.208 -webarena_verified.364.210,False,shopping_admin,retrieve_value,210,train,webarena_verified.364.209 -webarena_verified.364.211,False,shopping_admin,retrieve_value,211,train,webarena_verified.364.210 -webarena_verified.364.212,False,shopping_admin,retrieve_value,212,train,webarena_verified.364.211 -webarena_verified.249.213,False,shopping_admin,retrieve_value,213,test,webarena_verified.364.212 -webarena_verified.249.214,False,shopping_admin,retrieve_value,214,train,webarena_verified.249.213 -webarena_verified.249.215,False,shopping_admin,retrieve_value,215,test,webarena_verified.249.214 -webarena_verified.249.216,False,shopping_admin,retrieve_value,216,train,webarena_verified.249.215 -webarena_verified.249.217,False,shopping_admin,retrieve_value,217,train,webarena_verified.249.216 -webarena_verified.41.218,False,map,string_match,218,train,webarena_verified.36.155 -webarena_verified.41.219,False,map,string_match,219,test,webarena_verified.41.218 -webarena_verified.41.220,False,map,string_match,220,train,webarena_verified.41.219 -webarena_verified.35.221,False,map,string_match,221,test,webarena_verified.41.220 -webarena_verified.35.222,False,map,string_match,222,train,webarena_verified.35.221 -webarena_verified.35.223,False,map,string_match,223,test,webarena_verified.35.222 -webarena_verified.35.224,False,map,string_match,224,test,webarena_verified.35.223 -webarena_verified.135.225,False,shopping,retrieve_value,225,test,webarena_verified.214.192 -webarena_verified.370.226,False,shopping,retrieve_value,226,train,webarena_verified.135.225 
-webarena_verified.370.227,False,shopping,retrieve_value,227,train,webarena_verified.370.226 -webarena_verified.370.228,False,shopping,retrieve_value,228,test,webarena_verified.370.227 -webarena_verified.370.229,False,shopping,retrieve_value,229,test,webarena_verified.370.228 -webarena_verified.370.230,False,shopping,retrieve_value,230,train,webarena_verified.370.229 -webarena_verified.213.231,False,shopping,retrieve_value,231,test,webarena_verified.370.230 -webarena_verified.213.232,False,shopping,retrieve_value,232,train,webarena_verified.213.231 -webarena_verified.213.233,False,shopping,retrieve_value,233,test,webarena_verified.213.232 -webarena_verified.213.234,False,shopping,retrieve_value,234,train,webarena_verified.213.233 -webarena_verified.213.235,False,shopping,retrieve_value,235,train,webarena_verified.213.234 -webarena_verified.39.236,False,map,retrieve_value,236,train,webarena_verified.35.224 -webarena_verified.39.237,False,map,retrieve_value,237,train,webarena_verified.39.236 -webarena_verified.138.238,False,shopping,ui_state,238,train,webarena_verified.213.235 -webarena_verified.138.239,False,shopping,ui_state,239,train,webarena_verified.138.238 -webarena_verified.138.240,False,shopping,ui_state,240,test,webarena_verified.138.239 -webarena_verified.138.241,False,shopping,ui_state,241,train,webarena_verified.138.240 -webarena_verified.138.242,False,shopping,ui_state,242,test,webarena_verified.138.241 -webarena_verified.244.243,False,shopping_admin,retrieve_value,243,train,webarena_verified.249.217 -webarena_verified.244.244,False,shopping_admin,retrieve_value,244,test,webarena_verified.244.243 -webarena_verified.244.245,False,shopping_admin,retrieve_value,245,train,webarena_verified.244.244 -webarena_verified.244.246,False,shopping_admin,retrieve_value,246,test,webarena_verified.244.245 -webarena_verified.244.247,False,shopping_admin,retrieve_value,247,train,webarena_verified.244.246 
-webarena_verified.46.248,False,map,retrieve_value,248,test,webarena_verified.39.237 -webarena_verified.46.249,False,map,retrieve_value,249,train,webarena_verified.46.248 -webarena_verified.46.250,False,map,retrieve_value,250,test,webarena_verified.46.249 -webarena_verified.46.251,False,map,retrieve_value,251,train,webarena_verified.46.250 -webarena_verified.46.252,False,map,retrieve_value,252,train,webarena_verified.46.251 -webarena_verified.501.253,False,map,string_match,253,test,webarena_verified.46.252 -webarena_verified.501.254,False,map,retrieve_value,254,train,webarena_verified.501.253 -webarena_verified.501.255,False,map,retrieve_value,255,test,webarena_verified.501.254 -webarena_verified.501.256,False,map,retrieve_value,256,train,webarena_verified.501.255 -webarena_verified.501.257,False,map,string_match,257,test,webarena_verified.501.256 -webarena_verified.325.258,False,gitlab,ui_state,258,train,webarena_verified.320.207 -webarena_verified.312.259,False,gitlab,retrieve_value,259,train,webarena_verified.325.258 -webarena_verified.211.260,False,shopping,ui_state,260,test,webarena_verified.138.242 -webarena_verified.211.261,False,shopping,ui_state,261,train,webarena_verified.211.260 -webarena_verified.211.262,False,shopping,ui_state,262,train,webarena_verified.211.261 -webarena_verified.211.263,False,shopping,ui_state,263,test,webarena_verified.211.262 -webarena_verified.211.264,False,shopping,ui_state,264,train,webarena_verified.211.263 -webarena_verified.85.265,False,wikipedia map,retrieve_value,265,test,webarena_verified.501.257 -webarena_verified.85.266,False,wikipedia map,retrieve_value,266,test,webarena_verified.85.265 -webarena_verified.85.267,False,wikipedia map,retrieve_value,267,train,webarena_verified.85.266 -webarena_verified.85.268,False,wikipedia map,retrieve_value,268,test,webarena_verified.85.267 -webarena_verified.139.269,False,shopping,ui_state,269,train,webarena_verified.211.264 
-webarena_verified.139.270,False,shopping,ui_state,270,train,webarena_verified.139.269 -webarena_verified.139.271,False,shopping,ui_state,271,test,webarena_verified.139.270 -webarena_verified.139.272,False,shopping,ui_state,272,test,webarena_verified.139.271 -webarena_verified.139.273,False,shopping,ui_state,273,train,webarena_verified.139.272 -webarena_verified.212.274,False,shopping,ui_state,274,test,webarena_verified.139.273 -webarena_verified.212.275,False,shopping,ui_state,275,test,webarena_verified.212.274 -webarena_verified.212.276,False,shopping,ui_state,276,train,webarena_verified.212.275 -webarena_verified.212.277,False,shopping,ui_state,277,train,webarena_verified.212.276 -webarena_verified.212.278,False,shopping,ui_state,278,train,webarena_verified.212.277 -webarena_verified.204.279,False,shopping,retrieve_value,279,train,webarena_verified.212.278 -webarena_verified.204.280,False,shopping,retrieve_value,280,test,webarena_verified.204.279 -webarena_verified.204.281,False,shopping,retrieve_value,281,train,webarena_verified.204.280 -webarena_verified.204.282,False,shopping,retrieve_value,282,train,webarena_verified.204.281 -webarena_verified.210.283,False,shopping,ui_state,283,test,webarena_verified.204.282 -webarena_verified.207.284,False,shopping,ui_state,284,test,webarena_verified.210.283 -webarena_verified.207.285,False,shopping,ui_state,285,train,webarena_verified.207.284 -webarena_verified.207.286,False,shopping,ui_state,286,test,webarena_verified.207.285 -webarena_verified.47.287,False,map,string_match,287,test,webarena_verified.85.268 -webarena_verified.234.288,False,shopping_admin,retrieve_value,288,train,webarena_verified.244.247 -webarena_verified.234.289,False,shopping_admin,retrieve_value,289,test,webarena_verified.234.288 -webarena_verified.234.290,False,shopping_admin,retrieve_value,290,train,webarena_verified.234.289 -webarena_verified.234.291,False,shopping_admin,retrieve_value,291,train,webarena_verified.234.290 
-webarena_verified.234.292,False,shopping_admin,retrieve_value,292,test,webarena_verified.234.291 -webarena_verified.329.293,False,gitlab,retrieve_value,293,train,webarena_verified.312.259 -webarena_verified.329.294,False,gitlab,retrieve_value,294,train,webarena_verified.329.293 -webarena_verified.329.295,False,gitlab,retrieve_value,295,test,webarena_verified.329.294 -webarena_verified.329.296,False,gitlab,retrieve_value,296,train,webarena_verified.329.295 -webarena_verified.329.297,False,gitlab,retrieve_value,297,test,webarena_verified.329.296 -webarena_verified.180.298,False,shopping,ui_state,298,train,webarena_verified.207.286 -webarena_verified.180.299,False,shopping,ui_state,299,train,webarena_verified.180.298 -webarena_verified.180.300,False,shopping,ui_state,300,test,webarena_verified.180.299 -webarena_verified.180.301,False,shopping,retrieve_value,301,test,webarena_verified.180.300 -webarena_verified.180.302,False,shopping,retrieve_value,302,train,webarena_verified.180.301 -webarena_verified.321.303,False,gitlab,retrieve_value,303,test,webarena_verified.329.297 -webarena_verified.321.304,False,gitlab,retrieve_value,304,train,webarena_verified.321.303 -webarena_verified.321.305,False,gitlab,retrieve_value,305,train,webarena_verified.321.304 -webarena_verified.321.306,False,gitlab,retrieve_value,306,test,webarena_verified.321.305 -webarena_verified.321.307,False,gitlab,retrieve_value,307,train,webarena_verified.321.306 -webarena_verified.323.308,False,gitlab,retrieve_value,308,train,webarena_verified.321.307 -webarena_verified.323.309,False,gitlab,retrieve_value,309,train,webarena_verified.323.308 -webarena_verified.323.310,False,gitlab,retrieve_value,310,train,webarena_verified.323.309 -webarena_verified.323.311,False,gitlab,retrieve_value,311,test,webarena_verified.323.310 -webarena_verified.323.312,False,gitlab,retrieve_value,312,test,webarena_verified.323.311 -webarena_verified.134.313,False,shopping,retrieve_value,313,train,webarena_verified.180.302 
-webarena_verified.324.314,False,gitlab,retrieve_value,314,train,webarena_verified.323.312 -webarena_verified.324.315,False,gitlab,retrieve_value,315,train,webarena_verified.324.314 -webarena_verified.324.316,False,gitlab,retrieve_value,316,test,webarena_verified.324.315 -webarena_verified.324.317,False,gitlab,retrieve_value,317,test,webarena_verified.324.316 -webarena_verified.324.318,False,gitlab,retrieve_value,318,train,webarena_verified.324.317 -webarena_verified.160.319,False,shopping,retrieve_value,319,train,webarena_verified.134.313 -webarena_verified.160.320,False,shopping,retrieve_value,320,test,webarena_verified.160.319 -webarena_verified.160.321,False,shopping,retrieve_value,321,train,webarena_verified.160.320 -webarena_verified.160.322,False,shopping,retrieve_value,322,test,webarena_verified.160.321 -webarena_verified.160.323,False,shopping,retrieve_value,323,train,webarena_verified.160.322 -webarena_verified.208.324,False,shopping,ui_state,324,train,webarena_verified.160.323 -webarena_verified.208.325,False,shopping,ui_state,325,test,webarena_verified.208.324 -webarena_verified.208.326,False,shopping,ui_state,326,train,webarena_verified.208.325 -webarena_verified.208.327,False,shopping,ui_state,327,test,webarena_verified.208.326 -webarena_verified.208.328,False,shopping,ui_state,328,train,webarena_verified.208.327 -webarena_verified.147.329,False,shopping,retrieve_value,329,test,webarena_verified.208.328 -webarena_verified.147.330,False,shopping,retrieve_value,330,test,webarena_verified.147.329 -webarena_verified.147.331,False,shopping,retrieve_value,331,test,webarena_verified.147.330 -webarena_verified.147.332,False,shopping,retrieve_value,332,train,webarena_verified.147.331 -webarena_verified.147.333,False,shopping,retrieve_value,333,train,webarena_verified.147.332 -webarena_verified.169.334,False,shopping,retrieve_value,334,train,webarena_verified.147.333 -webarena_verified.169.335,False,shopping,retrieve_value,335,train,webarena_verified.169.334 
-webarena_verified.169.336,False,shopping,retrieve_value,336,test,webarena_verified.169.335 -webarena_verified.169.337,False,shopping,retrieve_value,337,test,webarena_verified.169.336 -webarena_verified.169.338,False,shopping,retrieve_value,338,train,webarena_verified.169.337 -webarena_verified.299.339,False,gitlab,ui_state,339,test,webarena_verified.324.318 -webarena_verified.299.340,False,gitlab,ui_state,340,train,webarena_verified.299.339 -webarena_verified.299.341,False,gitlab,ui_state,341,test,webarena_verified.299.340 -webarena_verified.299.342,False,gitlab,ui_state,342,test,webarena_verified.299.341 -webarena_verified.299.343,False,gitlab,ui_state,343,test,webarena_verified.299.342 -webarena_verified.248.344,False,shopping_admin,retrieve_value,344,test,webarena_verified.234.292 -webarena_verified.248.345,False,shopping_admin,retrieve_value,345,train,webarena_verified.248.344 -webarena_verified.248.346,False,shopping_admin,retrieve_value,346,train,webarena_verified.248.345 -webarena_verified.248.347,False,shopping_admin,retrieve_value,347,train,webarena_verified.248.346 -webarena_verified.248.348,False,shopping_admin,retrieve_value,348,test,webarena_verified.248.347 -webarena_verified.298.349,False,gitlab,retrieve_value,349,test,webarena_verified.299.343 -webarena_verified.298.350,False,gitlab,retrieve_value,350,test,webarena_verified.298.349 -webarena_verified.137.351,False,shopping,ui_state,351,train,webarena_verified.169.338 -webarena_verified.137.352,False,shopping,ui_state,352,test,webarena_verified.137.351 -webarena_verified.137.353,False,shopping,ui_state,353,test,webarena_verified.137.352 -webarena_verified.137.354,False,shopping,ui_state,354,train,webarena_verified.137.353 -webarena_verified.137.355,False,shopping,ui_state,355,train,webarena_verified.137.354 -webarena_verified.49.356,False,map,program_html,356,test,webarena_verified.47.287 -webarena_verified.291.357,False,gitlab,ui_state,357,test,webarena_verified.298.350 
-webarena_verified.206.358,False,shopping,retrieve_value,358,train,webarena_verified.137.355 -webarena_verified.206.359,False,shopping,retrieve_value,359,test,webarena_verified.206.358 -webarena_verified.206.360,False,shopping,retrieve_value,360,train,webarena_verified.206.359 -webarena_verified.206.361,False,shopping,retrieve_value,361,train,webarena_verified.206.360 -webarena_verified.206.362,False,shopping,retrieve_value,362,test,webarena_verified.206.361 -webarena_verified.58.363,False,map,retrieve_value,363,train,webarena_verified.49.356 -webarena_verified.58.364,False,map,retrieve_value,364,test,webarena_verified.58.363 -webarena_verified.58.365,False,map,retrieve_value,365,test,webarena_verified.58.364 -webarena_verified.58.366,False,map,retrieve_value,366,train,webarena_verified.58.365 -webarena_verified.58.367,False,map,retrieve_value,367,train,webarena_verified.58.366 -webarena_verified.188.368,False,shopping,retrieve_value,368,test,webarena_verified.206.362 -webarena_verified.52.369,False,map,program_html,369,train,webarena_verified.58.367 -webarena_verified.52.370,False,map,program_html,370,test,webarena_verified.52.369 -webarena_verified.52.371,False,map,program_html,371,test,webarena_verified.52.370 -webarena_verified.52.372,False,map,program_html,372,train,webarena_verified.52.371 -webarena_verified.52.373,False,map,program_html,373,train,webarena_verified.52.372 -webarena_verified.266.374,False,shopping_admin,ui_state,374,train,webarena_verified.248.348 -webarena_verified.266.375,False,shopping_admin,ui_state,375,train,webarena_verified.266.374 -webarena_verified.182.376,False,shopping,retrieve_value,376,test,webarena_verified.188.368 -webarena_verified.59.377,False,map,ui_state,377,test,webarena_verified.52.373 -webarena_verified.59.378,False,map,ui_state,378,train,webarena_verified.59.377 -webarena_verified.59.379,False,map,ui_state,379,train,webarena_verified.59.378 -webarena_verified.59.380,False,map,ui_state,380,test,webarena_verified.59.379 
-webarena_verified.59.381,False,map,ui_state,381,train,webarena_verified.59.380 -webarena_verified.781.382,False,map,string_match,382,test,webarena_verified.59.381 -webarena_verified.782.383,False,map,retrieve_value,383,test,webarena_verified.781.382 -webarena_verified.666.384,False,shopping,retrieve_value,384,test,webarena_verified.182.376 -webarena_verified.666.385,False,shopping,retrieve_value,385,train,webarena_verified.666.384 -webarena_verified.1355.386,False,shopping,retrieve_value,386,test,webarena_verified.666.385 -webarena_verified.1356.387,False,shopping,retrieve_value,387,train,webarena_verified.1355.386 -webarena_verified.1356.388,False,shopping,retrieve_value,388,test,webarena_verified.1356.387 -webarena_verified.348.389,False,gitlab,backend_state,389,test,webarena_verified.291.357 -webarena_verified.348.390,False,gitlab,backend_state,390,train,webarena_verified.348.389 -webarena_verified.348.391,False,gitlab,backend_state,391,train,webarena_verified.348.390 -webarena_verified.348.392,False,gitlab,backend_state,392,test,webarena_verified.348.391 -webarena_verified.348.393,False,gitlab,backend_state,393,train,webarena_verified.348.392 -webarena_verified.352.394,False,gitlab,backend_state,394,test,webarena_verified.348.393 -webarena_verified.352.395,False,gitlab,backend_state,395,train,webarena_verified.352.394 -webarena_verified.352.396,False,gitlab,backend_state,396,train,webarena_verified.352.395 -webarena_verified.352.397,False,gitlab,backend_state,397,train,webarena_verified.352.396 -webarena_verified.352.398,False,gitlab,backend_state,398,test,webarena_verified.352.397 -webarena_verified.6.399,False,reddit,backend_state,399,train,webarena_verified.17.69 -webarena_verified.6.400,False,reddit,backend_state,400,test,webarena_verified.6.399 -webarena_verified.6.401,False,reddit,backend_state,401,train,webarena_verified.6.400 -webarena_verified.6.402,False,reddit,backend_state,402,train,webarena_verified.6.401 
-webarena_verified.6.403,False,reddit,backend_state,403,test,webarena_verified.6.402 -webarena_verified.22.404,False,reddit,backend_state,404,train,webarena_verified.6.403 -webarena_verified.22.405,False,reddit,backend_state,405,test,webarena_verified.22.404 -webarena_verified.22.406,False,reddit,backend_state,406,train,webarena_verified.22.405 -webarena_verified.22.407,False,reddit,backend_state,407,test,webarena_verified.22.406 -webarena_verified.22.408,False,reddit,backend_state,408,train,webarena_verified.22.407 -webarena_verified.23.409,False,reddit,backend_state,409,test,webarena_verified.22.408 -webarena_verified.23.410,False,reddit,backend_state,410,test,webarena_verified.23.409 -webarena_verified.355.411,False,gitlab,backend_state,411,test,webarena_verified.352.398 -webarena_verified.355.412,False,gitlab,backend_state,412,test,webarena_verified.355.411 -webarena_verified.355.413,False,gitlab,backend_state,413,test,webarena_verified.355.412 -webarena_verified.355.414,False,gitlab,backend_state,414,test,webarena_verified.355.413 -webarena_verified.360.415,False,gitlab,backend_state,415,test,webarena_verified.355.414 -webarena_verified.360.416,False,gitlab,backend_state,416,test,webarena_verified.360.415 -webarena_verified.360.417,False,gitlab,backend_state,417,test,webarena_verified.360.416 -webarena_verified.361.418,False,gitlab,backend_state,418,train,webarena_verified.360.417 -webarena_verified.361.419,False,gitlab,backend_state,419,test,webarena_verified.361.418 -webarena_verified.361.420,False,gitlab,backend_state,420,test,webarena_verified.361.419 -webarena_verified.361.421,False,gitlab,backend_state,421,train,webarena_verified.361.420 -webarena_verified.361.422,False,gitlab,backend_state,422,train,webarena_verified.361.421 -webarena_verified.237.423,False,shopping_admin,backend_state,423,train,webarena_verified.266.375 -webarena_verified.371.424,False,wikipedia map,program_html,424,train,webarena_verified.782.383 
-webarena_verified.371.425,False,wikipedia map,program_html,425,train,webarena_verified.371.424 -webarena_verified.371.426,False,wikipedia map,program_html,426,test,webarena_verified.371.425 -webarena_verified.371.427,False,wikipedia map,program_html,427,test,webarena_verified.371.426 -webarena_verified.371.428,False,wikipedia map,program_html,428,train,webarena_verified.371.427 -webarena_verified.371.429,False,wikipedia map,program_html,429,train,webarena_verified.371.428 -webarena_verified.371.430,False,wikipedia map,program_html,430,test,webarena_verified.371.429 -webarena_verified.145.431,False,shopping,program_html,431,train,webarena_verified.1356.388 -webarena_verified.145.432,False,shopping,backend_state,432,test,webarena_verified.145.431 -webarena_verified.145.433,False,shopping,backend_state,433,train,webarena_verified.145.432 -webarena_verified.145.434,False,shopping,backend_state,434,train,webarena_verified.145.433 -webarena_verified.145.435,False,shopping,backend_state,435,train,webarena_verified.145.434 -webarena_verified.156.436,False,shopping,backend_state,436,test,webarena_verified.145.435 -webarena_verified.156.437,False,shopping,backend_state,437,train,webarena_verified.156.436 -webarena_verified.156.438,False,shopping,backend_state,438,train,webarena_verified.156.437 -webarena_verified.156.439,False,shopping,backend_state,439,train,webarena_verified.156.438 -webarena_verified.156.440,False,shopping,backend_state,440,test,webarena_verified.156.439 -webarena_verified.308.441,False,gitlab,backend_state,441,train,webarena_verified.361.422 -webarena_verified.308.442,False,gitlab,backend_state,442,train,webarena_verified.308.441 -webarena_verified.308.443,False,gitlab,backend_state,443,test,webarena_verified.308.442 -webarena_verified.308.444,False,gitlab,backend_state,444,train,webarena_verified.308.443 -webarena_verified.308.445,False,gitlab,backend_state,445,test,webarena_verified.308.444 
-webarena_verified.999.446,False,gitlab,backend_state,446,test,webarena_verified.308.445 -webarena_verified.999.447,False,gitlab,backend_state,447,train,webarena_verified.999.446 -webarena_verified.331.448,False,gitlab,backend_state,448,test,webarena_verified.999.447 -webarena_verified.331.449,False,gitlab,backend_state,449,test,webarena_verified.331.448 -webarena_verified.331.450,False,gitlab,retrieve_value,450,train,webarena_verified.331.449 -webarena_verified.331.451,False,gitlab,retrieve_value,451,train,webarena_verified.331.450 -webarena_verified.331.452,False,gitlab,retrieve_value,452,train,webarena_verified.331.451 -webarena_verified.242.453,False,shopping_admin,backend_state,453,train,webarena_verified.237.423 -webarena_verified.242.454,False,shopping_admin,backend_state,454,test,webarena_verified.242.453 -webarena_verified.242.455,False,shopping_admin,backend_state,455,train,webarena_verified.242.454 -webarena_verified.242.456,False,shopping_admin,backend_state,456,test,webarena_verified.242.455 -webarena_verified.242.457,False,shopping_admin,backend_state,457,train,webarena_verified.242.456 -webarena_verified.247.458,False,shopping_admin,backend_state,458,test,webarena_verified.242.457 -webarena_verified.247.459,False,shopping_admin,backend_state,459,test,webarena_verified.247.458 -webarena_verified.247.460,False,shopping_admin,backend_state,460,train,webarena_verified.247.459 -webarena_verified.247.461,False,shopping_admin,backend_state,461,train,webarena_verified.247.460 -webarena_verified.247.462,False,shopping_admin,backend_state,462,test,webarena_verified.247.461 -webarena_verified.247.463,False,shopping_admin,backend_state,463,test,webarena_verified.247.462 -webarena_verified.251.464,False,shopping_admin,backend_state,464,train,webarena_verified.247.463 -webarena_verified.186.465,False,shopping,backend_state,465,train,webarena_verified.156.440 -webarena_verified.186.466,False,shopping,backend_state,466,train,webarena_verified.186.465 
-webarena_verified.186.467,False,shopping,backend_state,467,train,webarena_verified.186.466 -webarena_verified.186.468,False,shopping,backend_state,468,test,webarena_verified.186.467 -webarena_verified.186.469,False,shopping,backend_state,469,test,webarena_verified.186.468 -webarena_verified.257.470,False,shopping_admin,backend_state,470,test,webarena_verified.251.464 -webarena_verified.257.471,False,shopping_admin,backend_state,471,test,webarena_verified.257.470 -webarena_verified.257.472,False,shopping_admin,backend_state,472,train,webarena_verified.257.471 -webarena_verified.257.473,False,shopping_admin,backend_state,473,train,webarena_verified.257.472 -webarena_verified.257.474,False,shopping_admin,backend_state,474,train,webarena_verified.257.473 -webarena_verified.292.475,False,gitlab,backend_state,475,train,webarena_verified.331.452 -webarena_verified.292.476,False,gitlab,backend_state,476,train,webarena_verified.292.475 -webarena_verified.292.477,False,gitlab,backend_state,477,train,webarena_verified.292.476 -webarena_verified.292.478,False,gitlab,backend_state,478,test,webarena_verified.292.477 -webarena_verified.292.479,False,gitlab,backend_state,479,test,webarena_verified.292.478 -webarena_verified.293.480,False,gitlab,backend_state,480,train,webarena_verified.292.479 -webarena_verified.294.481,False,gitlab,backend_state,481,train,webarena_verified.293.480 -webarena_verified.294.482,False,gitlab,backend_state,482,train,webarena_verified.294.481 -webarena_verified.294.483,False,gitlab,backend_state,483,test,webarena_verified.294.482 -webarena_verified.294.484,False,gitlab,backend_state,484,train,webarena_verified.294.483 -webarena_verified.294.485,False,gitlab,backend_state,485,test,webarena_verified.294.484 -webarena_verified.275.486,False,shopping_admin,backend_state,486,train,webarena_verified.257.474 -webarena_verified.275.487,False,shopping_admin,backend_state,487,test,webarena_verified.275.486 
-webarena_verified.275.488,False,shopping_admin,backend_state,488,test,webarena_verified.275.487 -webarena_verified.275.489,False,shopping_admin,backend_state,489,train,webarena_verified.275.488 -webarena_verified.275.490,False,shopping_admin,backend_state,490,train,webarena_verified.275.489 -webarena_verified.280.491,False,shopping_admin,retrieve_value,491,test,webarena_verified.275.490 -webarena_verified.280.492,False,shopping_admin,backend_state,492,train,webarena_verified.280.491 -webarena_verified.280.493,False,shopping_admin,backend_state,493,train,webarena_verified.280.492 -webarena_verified.280.494,False,shopping_admin,backend_state,494,train,webarena_verified.280.493 -webarena_verified.280.495,False,shopping_admin,backend_state,495,test,webarena_verified.280.494 -webarena_verified.284.496,False,shopping_admin,backend_state,496,train,webarena_verified.280.495 -webarena_verified.284.497,False,shopping_admin,backend_state,497,test,webarena_verified.284.496 -webarena_verified.284.498,False,shopping_admin,backend_state,498,test,webarena_verified.284.497 -webarena_verified.284.499,False,shopping_admin,backend_state,499,train,webarena_verified.284.498 -webarena_verified.284.500,False,shopping_admin,backend_state,500,train,webarena_verified.284.499 -webarena_verified.287.501,False,shopping_admin,backend_state,501,train,webarena_verified.284.500 -webarena_verified.287.502,False,shopping_admin,backend_state,502,test,webarena_verified.287.501 -webarena_verified.287.503,False,shopping_admin,backend_state,503,train,webarena_verified.287.502 -webarena_verified.287.504,False,shopping_admin,backend_state,504,test,webarena_verified.287.503 -webarena_verified.287.505,False,shopping_admin,backend_state,505,train,webarena_verified.287.504 -webarena_verified.172.506,False,shopping,backend_state,506,train,webarena_verified.186.469 -webarena_verified.172.507,False,shopping,backend_state,507,train,webarena_verified.172.506 
-webarena_verified.172.508,False,shopping,backend_state,508,test,webarena_verified.172.507 -webarena_verified.216.509,False,shopping,backend_state,509,test,webarena_verified.172.508 -webarena_verified.216.510,False,shopping,backend_state,510,test,webarena_verified.216.509 -webarena_verified.189.511,False,shopping,program_html,511,test,webarena_verified.216.510 -webarena_verified.189.512,False,shopping,program_html,512,train,webarena_verified.189.511 -webarena_verified.189.513,False,shopping,program_html,513,train,webarena_verified.189.512 -webarena_verified.189.514,False,shopping,program_html,514,test,webarena_verified.189.513 -webarena_verified.189.515,False,shopping,program_html,515,train,webarena_verified.189.514 -webarena_verified.196.516,False,shopping,backend_state,516,train,webarena_verified.189.515 -webarena_verified.196.517,False,shopping,backend_state,517,test,webarena_verified.196.516 -webarena_verified.196.518,False,shopping,backend_state,518,test,webarena_verified.196.517 -webarena_verified.196.519,False,shopping,backend_state,519,test,webarena_verified.196.518 -webarena_verified.196.520,False,shopping,backend_state,520,train,webarena_verified.196.519 -webarena_verified.199.521,False,shopping,backend_state,521,test,webarena_verified.196.520 -webarena_verified.352.522,False,gitlab,backend_state,522,test,webarena_verified.294.485 -webarena_verified.354.523,False,gitlab,backend_state,523,train,webarena_verified.352.522 -webarena_verified.354.524,False,gitlab,backend_state,524,test,webarena_verified.354.523 -webarena_verified.354.525,False,gitlab,backend_state,525,train,webarena_verified.354.524 -webarena_verified.354.526,False,gitlab,backend_state,526,train,webarena_verified.354.525 -webarena_verified.354.527,False,gitlab,backend_state,527,test,webarena_verified.354.526 -webarena_verified.154.528,False,shopping,program_html,528,train,webarena_verified.199.521 -webarena_verified.154.529,False,shopping,program_html,529,test,webarena_verified.154.528 
-webarena_verified.154.530,False,shopping,program_html,530,test,webarena_verified.154.529 -webarena_verified.154.531,False,shopping,program_html,531,train,webarena_verified.154.530 -webarena_verified.154.532,False,shopping,program_html,532,train,webarena_verified.154.531 -webarena_verified.330.533,False,gitlab,backend_state,533,test,webarena_verified.354.527 -webarena_verified.330.534,False,gitlab,backend_state,534,train,webarena_verified.330.533 -webarena_verified.330.535,False,gitlab,backend_state,535,test,webarena_verified.330.534 -webarena_verified.330.536,False,gitlab,backend_state,536,train,webarena_verified.330.535 -webarena_verified.330.537,False,gitlab,backend_state,537,train,webarena_verified.330.536 -webarena_verified.240.538,False,shopping_admin,backend_state,538,train,webarena_verified.287.505 -webarena_verified.240.539,False,shopping_admin,backend_state,539,train,webarena_verified.240.538 -webarena_verified.240.540,False,shopping_admin,backend_state,540,test,webarena_verified.240.539 -webarena_verified.240.541,False,shopping_admin,backend_state,541,test,webarena_verified.240.540 -webarena_verified.240.542,False,shopping_admin,backend_state,542,train,webarena_verified.240.541 -webarena_verified.251.543,False,shopping_admin,backend_state,543,test,webarena_verified.240.542 -webarena_verified.251.544,False,shopping_admin,backend_state,544,test,webarena_verified.251.543 -webarena_verified.251.545,False,shopping_admin,backend_state,545,test,webarena_verified.251.544 -webarena_verified.251.546,False,shopping_admin,retrieve_value,546,train,webarena_verified.251.545 -webarena_verified.252.547,False,shopping_admin,backend_state,547,train,webarena_verified.251.546 -webarena_verified.252.548,False,shopping_admin,backend_state,548,train,webarena_verified.252.547 -webarena_verified.252.549,False,shopping_admin,backend_state,549,test,webarena_verified.252.548 -webarena_verified.252.550,False,shopping_admin,backend_state,550,train,webarena_verified.252.549 
-webarena_verified.252.551,False,shopping_admin,backend_state,551,test,webarena_verified.252.550 -webarena_verified.84.552,False,gitlab reddit,program_html,552,test,webarena_verified.23.410 -webarena_verified.84.553,False,gitlab reddit,program_html,553,test,webarena_verified.84.552 -webarena_verified.84.554,False,gitlab reddit,program_html,554,test,webarena_verified.84.553 -webarena_verified.84.555,False,gitlab reddit,program_html,555,test,webarena_verified.84.554 -webarena_verified.87.556,False,gitlab wikipedia,program_html,556,train,webarena_verified.84.555 -webarena_verified.87.557,False,gitlab wikipedia,program_html,557,test,webarena_verified.87.556 -webarena_verified.87.558,False,gitlab wikipedia,program_html,558,train,webarena_verified.87.557 -webarena_verified.87.559,False,gitlab wikipedia,program_html,559,train,webarena_verified.87.558 -webarena_verified.87.560,False,gitlab wikipedia,program_html,560,test,webarena_verified.87.559 -webarena_verified.87.561,False,gitlab wikipedia,program_html,561,test,webarena_verified.87.560 -webarena_verified.88.562,False,gitlab reddit,program_html,562,train,webarena_verified.84.555 -webarena_verified.88.563,False,gitlab reddit,program_html,563,train,webarena_verified.88.562 -webarena_verified.88.564,False,gitlab reddit,program_html,564,train,webarena_verified.88.563 -webarena_verified.88.565,False,gitlab reddit,program_html,565,test,webarena_verified.88.564 -webarena_verified.88.566,False,gitlab reddit,program_html,566,test,webarena_verified.88.565 -webarena_verified.293.567,False,gitlab,backend_state,567,test,webarena_verified.88.566 -webarena_verified.293.568,False,gitlab,backend_state,568,train,webarena_verified.293.567 -webarena_verified.293.569,False,gitlab,backend_state,569,train,webarena_verified.293.568 -webarena_verified.293.570,False,gitlab,backend_state,570,test,webarena_verified.293.569 -webarena_verified.165.571,False,shopping,backend_state,571,test,webarena_verified.154.532 
-webarena_verified.165.572,False,shopping,backend_state,572,train,webarena_verified.165.571 -webarena_verified.165.573,False,shopping,backend_state,573,train,webarena_verified.165.572 -webarena_verified.165.574,False,shopping,backend_state,574,test,webarena_verified.165.573 -webarena_verified.165.575,False,shopping,backend_state,575,train,webarena_verified.165.574 -webarena_verified.351.576,False,gitlab,backend_state,576,test,webarena_verified.293.570 -webarena_verified.351.577,False,gitlab,backend_state,577,train,webarena_verified.351.576 -webarena_verified.351.578,False,gitlab,backend_state,578,test,webarena_verified.351.577 -webarena_verified.351.579,False,gitlab,backend_state,579,train,webarena_verified.351.578 -webarena_verified.7.580,False,reddit,backend_state,580,train,webarena_verified.88.566 -webarena_verified.7.581,False,reddit,backend_state,581,train,webarena_verified.7.580 -webarena_verified.7.582,False,reddit,backend_state,582,test,webarena_verified.7.581 -webarena_verified.7.583,False,reddit,backend_state,583,test,webarena_verified.7.582 -webarena_verified.7.584,False,reddit,backend_state,584,train,webarena_verified.7.583 -webarena_verified.194.585,False,shopping,backend_state,585,train,webarena_verified.165.575 -webarena_verified.194.586,False,shopping,backend_state,586,test,webarena_verified.194.585 -webarena_verified.194.587,False,shopping,backend_state,587,train,webarena_verified.194.586 -webarena_verified.194.588,False,shopping,backend_state,588,train,webarena_verified.194.587 -webarena_verified.194.589,False,shopping,backend_state,589,test,webarena_verified.194.588 -webarena_verified.339.590,False,gitlab,backend_state,590,train,webarena_verified.351.579 -webarena_verified.339.591,False,gitlab,backend_state,591,test,webarena_verified.339.590 -webarena_verified.339.592,False,gitlab,backend_state,592,test,webarena_verified.339.591 -webarena_verified.339.593,False,gitlab,backend_state,593,test,webarena_verified.339.592 
-webarena_verified.339.594,False,gitlab,backend_state,594,train,webarena_verified.339.593 -webarena_verified.4.595,False,reddit,backend_state,595,train,webarena_verified.7.584 -webarena_verified.4.596,False,reddit,backend_state,596,test,webarena_verified.4.595 -webarena_verified.4.597,False,reddit,backend_state,597,train,webarena_verified.4.596 -webarena_verified.4.598,False,reddit,backend_state,598,train,webarena_verified.4.597 -webarena_verified.4.599,False,reddit,backend_state,599,test,webarena_verified.4.598 -webarena_verified.3765.600,False,reddit,backend_state,600,test,webarena_verified.4.599 -webarena_verified.3765.601,False,reddit,backend_state,601,train,webarena_verified.3765.600 -webarena_verified.3765.602,False,reddit,backend_state,602,train,webarena_verified.3765.601 -webarena_verified.3765.603,False,reddit,backend_state,603,train,webarena_verified.3765.602 -webarena_verified.3765.604,False,reddit,backend_state,604,test,webarena_verified.3765.603 -webarena_verified.5.605,False,reddit,backend_state,605,train,webarena_verified.3765.604 -webarena_verified.5.606,False,reddit,backend_state,606,train,webarena_verified.5.605 -webarena_verified.5.607,False,reddit,backend_state,607,test,webarena_verified.5.606 -webarena_verified.5.608,False,reddit,backend_state,608,test,webarena_verified.5.607 -webarena_verified.5.609,False,reddit,backend_state,609,train,webarena_verified.5.608 -webarena_verified.9.610,False,reddit,backend_state,610,train,webarena_verified.5.609 -webarena_verified.9.611,False,reddit,backend_state,611,train,webarena_verified.9.610 -webarena_verified.9.612,False,reddit,backend_state,612,test,webarena_verified.9.611 -webarena_verified.9.613,False,reddit,backend_state,613,train,webarena_verified.9.612 -webarena_verified.9.614,False,reddit,backend_state,614,test,webarena_verified.9.613 -webarena_verified.11.615,False,reddit,ui_state,615,test,webarena_verified.9.614 -webarena_verified.11.616,False,reddit,ui_state,616,test,webarena_verified.11.615 
-webarena_verified.11.617,False,reddit,ui_state,617,train,webarena_verified.11.616 -webarena_verified.11.618,False,reddit,ui_state,618,train,webarena_verified.11.617 -webarena_verified.11.619,False,reddit,ui_state,619,train,webarena_verified.11.618 -webarena_verified.12.620,False,reddit,backend_state,620,train,webarena_verified.11.619 -webarena_verified.12.621,False,reddit,backend_state,621,train,webarena_verified.12.620 -webarena_verified.12.622,False,reddit,backend_state,622,train,webarena_verified.12.621 -webarena_verified.12.623,False,reddit,backend_state,623,test,webarena_verified.12.622 -webarena_verified.12.624,False,reddit,backend_state,624,test,webarena_verified.12.623 -webarena_verified.13.625,False,reddit,backend_state,625,train,webarena_verified.12.624 -webarena_verified.13.626,False,reddit,backend_state,626,train,webarena_verified.13.625 -webarena_verified.13.627,False,reddit,backend_state,627,train,webarena_verified.13.626 -webarena_verified.13.628,False,reddit,backend_state,628,test,webarena_verified.13.627 -webarena_verified.13.629,False,reddit,backend_state,629,test,webarena_verified.13.628 -webarena_verified.15.630,False,reddit,backend_state,630,test,webarena_verified.13.629 -webarena_verified.15.631,False,reddit,backend_state,631,train,webarena_verified.15.630 -webarena_verified.15.632,False,reddit,backend_state,632,train,webarena_verified.15.631 -webarena_verified.15.633,False,reddit,backend_state,633,test,webarena_verified.15.632 -webarena_verified.15.634,False,reddit,backend_state,634,train,webarena_verified.15.633 -webarena_verified.6100.635,False,reddit,backend_state,635,train,webarena_verified.15.634 -webarena_verified.6100.636,False,reddit,backend_state,636,train,webarena_verified.6100.635 -webarena_verified.6100.637,False,reddit,backend_state,637,train,webarena_verified.6100.636 -webarena_verified.6100.638,False,reddit,ui_state,638,test,webarena_verified.6100.637 
-webarena_verified.6100.639,False,reddit,backend_state,639,test,webarena_verified.6100.638 -webarena_verified.16.640,False,reddit,backend_state,640,train,webarena_verified.6100.639 -webarena_verified.16.641,False,reddit,backend_state,641,test,webarena_verified.16.640 -webarena_verified.16.642,False,reddit,backend_state,642,test,webarena_verified.16.641 -webarena_verified.16.643,False,reddit,backend_state,643,train,webarena_verified.16.642 -webarena_verified.16.644,False,reddit,backend_state,644,train,webarena_verified.16.643 -webarena_verified.19.645,False,reddit,backend_state,645,train,webarena_verified.16.644 -webarena_verified.19.646,False,reddit,backend_state,646,train,webarena_verified.19.645 -webarena_verified.19.647,False,reddit,backend_state,647,train,webarena_verified.19.646 -webarena_verified.19.648,False,reddit,backend_state,648,test,webarena_verified.19.647 -webarena_verified.19.649,False,reddit,backend_state,649,test,webarena_verified.19.648 -webarena_verified.23.650,False,reddit,backend_state,650,train,webarena_verified.19.649 -webarena_verified.23.651,False,reddit,backend_state,651,train,webarena_verified.23.650 -webarena_verified.23.652,False,reddit,backend_state,652,train,webarena_verified.23.651 -webarena_verified.153.653,False,shopping,ui_state,653,train,webarena_verified.194.589 -webarena_verified.153.654,False,shopping,ui_state,654,test,webarena_verified.153.653 -webarena_verified.153.655,False,shopping,ui_state,655,test,webarena_verified.153.654 -webarena_verified.153.656,False,shopping,ui_state,656,train,webarena_verified.153.655 -webarena_verified.153.657,False,shopping,ui_state,657,train,webarena_verified.153.656 -webarena_verified.327.658,False,gitlab,backend_state,658,train,webarena_verified.339.594 -webarena_verified.327.659,False,gitlab,backend_state,659,test,webarena_verified.327.658 -webarena_verified.327.660,False,gitlab,backend_state,660,test,webarena_verified.327.659 
-webarena_verified.328.661,False,gitlab,backend_state,661,test,webarena_verified.327.660 -webarena_verified.328.662,False,gitlab,backend_state,662,train,webarena_verified.328.661 -webarena_verified.328.663,False,gitlab,backend_state,663,train,webarena_verified.328.662 -webarena_verified.328.664,False,gitlab,backend_state,664,test,webarena_verified.328.663 -webarena_verified.328.665,False,gitlab,backend_state,665,train,webarena_verified.328.664 -webarena_verified.335.666,False,gitlab,retrieve_value,666,test,webarena_verified.328.665 -webarena_verified.335.667,False,gitlab,backend_state,667,test,webarena_verified.335.666 -webarena_verified.335.668,False,gitlab,retrieve_value,668,test,webarena_verified.335.667 -webarena_verified.337.669,False,gitlab,backend_state,669,test,webarena_verified.335.668 -webarena_verified.337.670,False,gitlab,backend_state,670,train,webarena_verified.337.669 -webarena_verified.101.671,False,shopping reddit,ui_state,671,train,webarena_verified.23.652 -webarena_verified.101.672,False,shopping reddit,ui_state,672,train,webarena_verified.101.671 -webarena_verified.101.673,False,shopping reddit,ui_state,673,test,webarena_verified.101.672 -webarena_verified.101.674,False,shopping reddit,ui_state,674,test,webarena_verified.101.673 -webarena_verified.101.675,False,shopping reddit,ui_state,675,train,webarena_verified.101.674 -webarena_verified.253.676,False,shopping_admin,ui_state,676,test,webarena_verified.252.551 -webarena_verified.253.677,False,shopping_admin,ui_state,677,test,webarena_verified.253.676 -webarena_verified.253.678,False,shopping_admin,ui_state,678,train,webarena_verified.253.677 -webarena_verified.253.679,False,shopping_admin,ui_state,679,train,webarena_verified.253.678 -webarena_verified.253.680,False,shopping_admin,ui_state,680,train,webarena_verified.253.679 -webarena_verified.116.681,False,reddit gitlab,ui_state,681,train,webarena_verified.337.670 -webarena_verified.116.682,False,reddit 
gitlab,ui_state,682,train,webarena_verified.116.681 -webarena_verified.116.683,False,reddit gitlab,ui_state,683,test,webarena_verified.116.682 -webarena_verified.117.684,False,reddit gitlab,ui_state,684,train,webarena_verified.116.683 -webarena_verified.117.685,False,reddit gitlab,ui_state,685,train,webarena_verified.117.684 -webarena_verified.117.686,False,reddit gitlab,ui_state,686,train,webarena_verified.117.685 -webarena_verified.117.687,False,reddit gitlab,ui_state,687,test,webarena_verified.117.686 -webarena_verified.117.688,False,reddit gitlab,ui_state,688,test,webarena_verified.117.687 -webarena_verified.163.689,False,shopping,ui_state,689,test,webarena_verified.101.675 -webarena_verified.163.690,False,shopping,ui_state,690,test,webarena_verified.163.689 -webarena_verified.163.691,False,shopping,ui_state,691,train,webarena_verified.163.690 -webarena_verified.163.692,False,shopping,ui_state,692,train,webarena_verified.163.691 -webarena_verified.163.693,False,shopping,ui_state,693,train,webarena_verified.163.692 -webarena_verified.256.694,False,shopping_admin,backend_state,694,train,webarena_verified.253.680 -webarena_verified.256.695,False,shopping_admin,backend_state,695,train,webarena_verified.256.694 -webarena_verified.256.696,False,shopping_admin,backend_state,696,test,webarena_verified.256.695 -webarena_verified.256.697,False,shopping_admin,backend_state,697,train,webarena_verified.256.696 -webarena_verified.256.698,False,shopping_admin,backend_state,698,test,webarena_verified.256.697 -webarena_verified.258.699,False,shopping_admin,backend_state,699,train,webarena_verified.256.698 -webarena_verified.258.700,False,shopping_admin,backend_state,700,test,webarena_verified.258.699 -webarena_verified.258.701,False,shopping_admin,backend_state,701,test,webarena_verified.258.700 -webarena_verified.258.702,False,shopping_admin,backend_state,702,train,webarena_verified.258.701 
-webarena_verified.258.703,False,shopping_admin,backend_state,703,train,webarena_verified.258.702 -webarena_verified.268.704,False,shopping_admin,ui_state,704,test,webarena_verified.258.703 -webarena_verified.268.705,False,shopping_admin,ui_state,705,test,webarena_verified.268.704 -webarena_verified.268.706,False,shopping_admin,ui_state,706,train,webarena_verified.268.705 -webarena_verified.268.707,False,shopping_admin,ui_state,707,train,webarena_verified.268.706 -webarena_verified.268.708,False,shopping_admin,ui_state,708,train,webarena_verified.268.707 -webarena_verified.271.709,False,shopping_admin,ui_state,709,test,webarena_verified.268.708 -webarena_verified.271.710,False,shopping_admin,ui_state,710,test,webarena_verified.271.709 -webarena_verified.271.711,False,shopping_admin,ui_state,711,train,webarena_verified.271.710 -webarena_verified.271.712,False,shopping_admin,ui_state,712,train,webarena_verified.271.711 -webarena_verified.271.713,False,shopping_admin,ui_state,713,train,webarena_verified.271.712 -webarena_verified.24.714,False,reddit,backend_state,714,train,webarena_verified.117.688 -webarena_verified.24.715,False,reddit,backend_state,715,train,webarena_verified.24.714 -webarena_verified.24.716,False,reddit,backend_state,716,train,webarena_verified.24.715 -webarena_verified.24.717,False,reddit,backend_state,717,test,webarena_verified.24.716 -webarena_verified.24.718,False,reddit,backend_state,718,test,webarena_verified.24.717 -webarena_verified.25.719,False,reddit,backend_state,719,train,webarena_verified.24.718 -webarena_verified.25.720,False,reddit,backend_state,720,test,webarena_verified.25.719 -webarena_verified.25.721,False,reddit,backend_state,721,train,webarena_verified.25.720 -webarena_verified.25.722,False,reddit,backend_state,722,train,webarena_verified.25.721 -webarena_verified.25.723,False,reddit,backend_state,723,test,webarena_verified.25.722 -webarena_verified.25.724,False,reddit,backend_state,724,test,webarena_verified.25.723 
-webarena_verified.1510.725,False,reddit,backend_state,725,test,webarena_verified.25.724 -webarena_verified.1510.726,False,reddit,backend_state,726,test,webarena_verified.1510.725 -webarena_verified.1510.727,False,reddit,backend_state,727,train,webarena_verified.1510.726 -webarena_verified.1510.728,False,reddit,backend_state,728,train,webarena_verified.1510.727 -webarena_verified.1510.729,False,reddit,backend_state,729,train,webarena_verified.1510.728 -webarena_verified.1510.730,False,reddit,backend_state,730,test,webarena_verified.1510.729 -webarena_verified.27.731,False,reddit,backend_state,731,test,webarena_verified.1510.730 -webarena_verified.27.732,False,reddit,backend_state,732,train,webarena_verified.27.731 -webarena_verified.27.733,False,reddit,backend_state,733,train,webarena_verified.27.732 -webarena_verified.27.734,False,reddit,program_html,734,train,webarena_verified.27.733 -webarena_verified.27.735,False,reddit,program_html,735,test,webarena_verified.27.734 -webarena_verified.355.736,False,gitlab,backend_state,736,train,webarena_verified.117.688 -webarena_verified.94.737,False,wikipedia map,program_html,737,train,webarena_verified.371.430 -webarena_verified.94.738,False,wikipedia map,program_html,738,test,webarena_verified.94.737 -webarena_verified.94.739,False,wikipedia map,program_html,739,train,webarena_verified.94.738 -webarena_verified.94.740,False,wikipedia map,program_html,740,test,webarena_verified.94.739 -webarena_verified.94.741,False,wikipedia map,program_html,741,train,webarena_verified.94.740 -webarena_verified.332.742,False,gitlab,backend_state,742,test,webarena_verified.355.736 -webarena_verified.332.743,False,gitlab,backend_state,743,test,webarena_verified.332.742 -webarena_verified.332.744,False,gitlab,backend_state,744,test,webarena_verified.332.743 -webarena_verified.332.745,False,gitlab,backend_state,745,test,webarena_verified.332.744 -webarena_verified.332.746,False,gitlab,backend_state,746,train,webarena_verified.332.745 
-webarena_verified.2100.747,False,gitlab,backend_state,747,train,webarena_verified.332.746 -webarena_verified.2100.748,False,gitlab,backend_state,748,train,webarena_verified.2100.747 -webarena_verified.2100.749,False,gitlab,backend_state,749,test,webarena_verified.2100.748 -webarena_verified.2100.750,False,gitlab,backend_state,750,test,webarena_verified.2100.749 -webarena_verified.2100.751,False,gitlab,backend_state,751,train,webarena_verified.2100.750 -webarena_verified.332.752,False,gitlab,backend_state,752,train,webarena_verified.2100.751 -webarena_verified.332.753,False,gitlab,backend_state,753,test,webarena_verified.332.752 -webarena_verified.332.754,False,gitlab,backend_state,754,train,webarena_verified.332.753 -webarena_verified.332.755,False,gitlab,backend_state,755,test,webarena_verified.332.754 -webarena_verified.332.756,False,gitlab,backend_state,756,train,webarena_verified.332.755 -webarena_verified.42.757,False,map,program_html,757,test,webarena_verified.94.741 -webarena_verified.42.758,False,map,program_html,758,test,webarena_verified.42.757 -webarena_verified.42.759,False,map shopping_admin,program_html,759,test,webarena_verified.271.713 -webarena_verified.42.760,False,map shopping_admin,program_html,760,test,webarena_verified.42.759 -webarena_verified.54.761,False,map,program_html,761,train,webarena_verified.42.760 -webarena_verified.54.762,False,map,program_html,762,train,webarena_verified.54.761 -webarena_verified.75.763,False,map,program_html,763,test,webarena_verified.54.762 -webarena_verified.75.764,False,map,program_html,764,test,webarena_verified.75.763 -webarena_verified.75.765,False,map,program_html,765,train,webarena_verified.75.764 -webarena_verified.75.766,False,map,program_html,766,train,webarena_verified.75.765 -webarena_verified.75.767,False,map,program_html,767,train,webarena_verified.75.766 -webarena_verified.241.768,False,shopping_admin,backend_state,768,test,webarena_verified.42.760 
-webarena_verified.241.769,False,shopping_admin,backend_state,769,test,webarena_verified.241.768 -webarena_verified.241.770,False,shopping_admin,backend_state,770,train,webarena_verified.241.769 -webarena_verified.243.771,False,shopping_admin,backend_state,771,test,webarena_verified.241.770 -webarena_verified.246.772,False,shopping_admin,backend_state,772,test,webarena_verified.243.771 -webarena_verified.246.773,False,shopping_admin,backend_state,773,train,webarena_verified.246.772 -webarena_verified.246.774,False,shopping_admin,backend_state,774,train,webarena_verified.246.773 -webarena_verified.246.775,False,shopping_admin,backend_state,775,train,webarena_verified.246.774 -webarena_verified.246.776,False,shopping_admin,backend_state,776,test,webarena_verified.246.775 -webarena_verified.742.777,False,shopping_admin,backend_state,777,train,webarena_verified.246.776 -webarena_verified.742.778,False,shopping_admin,backend_state,778,test,webarena_verified.742.777 -webarena_verified.742.779,False,shopping_admin,backend_state,779,train,webarena_verified.742.778 -webarena_verified.742.780,False,shopping_admin,backend_state,780,test,webarena_verified.742.779 -webarena_verified.742.781,False,shopping_admin,backend_state,781,train,webarena_verified.742.780 -webarena_verified.742.782,False,shopping_admin,backend_state,782,test,webarena_verified.742.781 -webarena_verified.351.783,False,gitlab,retrieve_value,783,train,webarena_verified.332.756 -webarena_verified.316.784,False,gitlab,retrieve_value,784,test,webarena_verified.351.783 -webarena_verified.316.785,False,gitlab,retrieve_value,785,test,webarena_verified.316.784 -webarena_verified.316.786,False,gitlab,retrieve_value,786,test,webarena_verified.316.785 -webarena_verified.316.787,False,gitlab,retrieve_value,787,test,webarena_verified.316.786 -webarena_verified.316.788,False,gitlab,retrieve_value,788,test,webarena_verified.316.787 -webarena_verified.328.789,False,gitlab,retrieve_value,789,test,webarena_verified.316.788 
-webarena_verified.246.790,False,shopping_admin,retrieve_value,790,test,webarena_verified.742.782 -webarena_verified.84.791,False,gitlab reddit,string_match,791,train,webarena_verified.27.735 -webarena_verified.172.792,False,shopping,retrieve_value,792,test,webarena_verified.163.693 -webarena_verified.172.793,False,shopping,retrieve_value,793,train,webarena_verified.172.792 -webarena_verified.191.794,False,shopping,retrieve_value,794,test,webarena_verified.172.793 -webarena_verified.191.795,False,shopping,retrieve_value,795,train,webarena_verified.191.794 -webarena_verified.191.796,False,shopping,retrieve_value,796,train,webarena_verified.191.795 -webarena_verified.191.797,False,shopping,retrieve_value,797,test,webarena_verified.191.796 -webarena_verified.191.798,False,shopping,retrieve_value,798,train,webarena_verified.191.797 -webarena_verified.600.799,False,gitlab,backend_state,799,train,webarena_verified.84.791 -webarena_verified.600.800,False,gitlab,backend_state,800,test,webarena_verified.600.799 -webarena_verified.600.801,False,gitlab,backend_state,801,train,webarena_verified.600.800 -webarena_verified.600.802,False,gitlab,backend_state,802,train,webarena_verified.600.801 -webarena_verified.600.803,False,gitlab,backend_state,803,test,webarena_verified.600.802 -webarena_verified.999.804,False,gitlab,backend_state,804,train,webarena_verified.600.803 -webarena_verified.335.805,False,gitlab,backend_state,805,test,webarena_verified.999.804 -webarena_verified.335.806,False,gitlab,backend_state,806,test,webarena_verified.335.805 -webarena_verified.335.807,False,gitlab,backend_state,807,train,webarena_verified.335.806 -webarena_verified.327.808,False,gitlab,backend_state,808,train,webarena_verified.335.807 -webarena_verified.327.809,False,gitlab,backend_state,809,train,webarena_verified.327.808 -webarena_verified.999.810,False,gitlab,backend_state,810,test,webarena_verified.327.809 
-webarena_verified.999.811,False,gitlab,backend_state,811,test,webarena_verified.999.810 +webarena_verified.279.0.2,False,shopping_admin,AgentResponseEvaluator,0,train, +webarena_verified.279.1.2,False,shopping_admin,AgentResponseEvaluator,1,test,webarena_verified.279.0.2 +webarena_verified.279.2.2,False,shopping_admin,AgentResponseEvaluator,2,train,webarena_verified.279.1.2 +webarena_verified.279.3.2,False,shopping_admin,AgentResponseEvaluator,3,test,webarena_verified.279.2.2 +webarena_verified.279.4.2,False,shopping_admin,AgentResponseEvaluator,4,train,webarena_verified.279.3.2 +webarena_verified.279.5.2,False,shopping_admin,AgentResponseEvaluator,5,train,webarena_verified.279.4.2 +webarena_verified.279.6.2,False,shopping_admin,AgentResponseEvaluator,6,test,webarena_verified.279.5.2 +webarena_verified.79.7.2,False,map,AgentResponseEvaluator,7,train, +webarena_verified.79.8.2,False,map,AgentResponseEvaluator,8,test,webarena_verified.79.7.2 +webarena_verified.79.9.2,False,map,AgentResponseEvaluator,9,test,webarena_verified.79.8.2 +webarena_verified.79.10.2,False,map,AgentResponseEvaluator,10,test,webarena_verified.79.9.2 +webarena_verified.288.11.2,False,shopping_admin,AgentResponseEvaluator,11,test,webarena_verified.279.6.2 +webarena_verified.288.12.2,False,shopping_admin,AgentResponseEvaluator,12,train,webarena_verified.288.11.2 +webarena_verified.288.13.2,False,shopping_admin,AgentResponseEvaluator,13,train,webarena_verified.288.12.2 +webarena_verified.288.14.2,False,shopping_admin,AgentResponseEvaluator,14,train,webarena_verified.288.13.2 +webarena_verified.288.15.2,False,shopping_admin,AgentResponseEvaluator,15,test,webarena_verified.288.14.2 +webarena_verified.73.16.2,False,map,AgentResponseEvaluator,16,test,webarena_verified.79.10.2 +webarena_verified.73.17.2,False,map,AgentResponseEvaluator,17,train,webarena_verified.73.16.2 +webarena_verified.73.18.2,False,map,AgentResponseEvaluator,18,test,webarena_verified.73.17.2 
+webarena_verified.73.19.2,False,map,AgentResponseEvaluator,19,train,webarena_verified.73.18.2 +webarena_verified.73.20.2,False,map,AgentResponseEvaluator,20,test,webarena_verified.73.19.2 +webarena_verified.222.21.2,False,shopping,AgentResponseEvaluator,21,test, +webarena_verified.222.22.2,False,shopping,AgentResponseEvaluator,22,test,webarena_verified.222.21.2 +webarena_verified.222.23.2,False,shopping,AgentResponseEvaluator,23,test,webarena_verified.222.22.2 +webarena_verified.222.24.2,False,shopping,AgentResponseEvaluator,24,test,webarena_verified.222.23.2 +webarena_verified.222.25.2,False,shopping,AgentResponseEvaluator,25,test,webarena_verified.222.24.2 +webarena_verified.222.26.2,False,shopping,AgentResponseEvaluator,26,test,webarena_verified.222.25.2 +webarena_verified.33.27.2,False,reddit,AgentResponseEvaluator,27,test, +webarena_verified.33.28.2,False,reddit,AgentResponseEvaluator,28,train,webarena_verified.33.27.2 +webarena_verified.33.29.2,False,reddit,AgentResponseEvaluator,29,train,webarena_verified.33.28.2 +webarena_verified.33.30.2,False,reddit,AgentResponseEvaluator,30,test,webarena_verified.33.29.2 +webarena_verified.33.31.2,False,reddit,AgentResponseEvaluator,31,train,webarena_verified.33.30.2 +webarena_verified.78.32.2,False,map,AgentResponseEvaluator,32,test,webarena_verified.73.20.2 +webarena_verified.78.33.2,False,map,AgentResponseEvaluator,33,test,webarena_verified.78.32.2 +webarena_verified.78.34.2,False,map,AgentResponseEvaluator,34,train,webarena_verified.78.33.2 +webarena_verified.78.35.2,False,map,AgentResponseEvaluator,35,test,webarena_verified.78.34.2 +webarena_verified.77.36.2,False,map,AgentResponseEvaluator,36,test,webarena_verified.78.35.2 +webarena_verified.77.37.2,False,map,AgentResponseEvaluator,37,train,webarena_verified.77.36.2 +webarena_verified.77.38.2,False,map,AgentResponseEvaluator,38,train,webarena_verified.77.37.2 +webarena_verified.77.39.2,False,map,AgentResponseEvaluator,39,train,webarena_verified.77.38.2 
+webarena_verified.77.40.2,False,map,AgentResponseEvaluator,40,test,webarena_verified.77.39.2 +webarena_verified.285.41.2,False,shopping_admin,AgentResponseEvaluator,41,train,webarena_verified.288.15.2 +webarena_verified.285.42.2,False,shopping_admin,AgentResponseEvaluator,42,train,webarena_verified.285.41.2 +webarena_verified.285.43.2,False,shopping_admin,AgentResponseEvaluator,43,test,webarena_verified.285.42.2 +webarena_verified.303.44.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,44,train, +webarena_verified.300.45.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,45,test,webarena_verified.303.44.2 +webarena_verified.300.46.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,46,test,webarena_verified.300.45.2 +webarena_verified.197.47.2,False,shopping,AgentResponseEvaluator,47,train,webarena_verified.222.26.2 +webarena_verified.197.48.2,False,shopping,AgentResponseEvaluator,48,test,webarena_verified.197.47.2 +webarena_verified.197.49.2,False,shopping,AgentResponseEvaluator,49,train,webarena_verified.197.48.2 +webarena_verified.197.50.2,False,shopping,AgentResponseEvaluator,50,train,webarena_verified.197.49.2 +webarena_verified.197.51.2,False,shopping,AgentResponseEvaluator,51,test,webarena_verified.197.50.2 +webarena_verified.68.52.2,False,map,AgentResponseEvaluator,52,test,webarena_verified.77.40.2 +webarena_verified.68.53.2,False,map,AgentResponseEvaluator,53,train,webarena_verified.68.52.2 +webarena_verified.68.54.2,False,map,AgentResponseEvaluator,54,test,webarena_verified.68.53.2 +webarena_verified.68.55.2,False,map,AgentResponseEvaluator,55,train,webarena_verified.68.54.2 +webarena_verified.68.56.2,False,map,AgentResponseEvaluator,56,train,webarena_verified.68.55.2 +webarena_verified.69.57.2,False,map,AgentResponseEvaluator,57,train,webarena_verified.68.56.2 +webarena_verified.69.58.2,False,map,AgentResponseEvaluator,58,train,webarena_verified.69.57.2 
+webarena_verified.69.59.2,False,map,AgentResponseEvaluator,59,test,webarena_verified.69.58.2 +webarena_verified.69.60.2,False,map,AgentResponseEvaluator,60,test,webarena_verified.69.59.2 +webarena_verified.69.61.2,False,map,AgentResponseEvaluator,61,train,webarena_verified.69.60.2 +webarena_verified.276.62.2,False,shopping_admin,AgentResponseEvaluator,62,train,webarena_verified.285.43.2 +webarena_verified.276.63.2,False,shopping_admin,AgentResponseEvaluator,63,test,webarena_verified.276.62.2 +webarena_verified.276.64.2,False,shopping_admin,AgentResponseEvaluator,64,test,webarena_verified.276.63.2 +webarena_verified.276.65.2,False,shopping_admin,AgentResponseEvaluator,65,train,webarena_verified.276.64.2 +webarena_verified.17.66.2,False,reddit,AgentResponseEvaluator,66,test,webarena_verified.33.31.2 +webarena_verified.17.67.2,False,reddit,AgentResponseEvaluator,67,test,webarena_verified.17.66.2 +webarena_verified.17.68.2,False,reddit,AgentResponseEvaluator,68,train,webarena_verified.17.67.2 +webarena_verified.17.69.2,False,reddit,AgentResponseEvaluator,69,test,webarena_verified.17.68.2 +webarena_verified.70.70.2,False,map,AgentResponseEvaluator,70,train,webarena_verified.69.61.2 +webarena_verified.70.71.2,False,map,AgentResponseEvaluator,71,test,webarena_verified.70.70.2 +webarena_verified.70.72.2,False,map,AgentResponseEvaluator,72,train,webarena_verified.70.71.2 +webarena_verified.70.73.2,False,map,AgentResponseEvaluator,73,test,webarena_verified.70.72.2 +webarena_verified.65.74.2,False,map,AgentResponseEvaluator,74,train,webarena_verified.70.73.2 +webarena_verified.65.75.2,False,map,AgentResponseEvaluator,75,train,webarena_verified.65.74.2 +webarena_verified.65.76.2,False,map,AgentResponseEvaluator,76,train,webarena_verified.65.75.2 +webarena_verified.277.77.2,False,shopping_admin,AgentResponseEvaluator,77,test,webarena_verified.276.65.2 +webarena_verified.277.78.2,False,shopping_admin,AgentResponseEvaluator,78,train,webarena_verified.277.77.2 
+webarena_verified.277.79.2,False,shopping_admin,AgentResponseEvaluator,79,test,webarena_verified.277.78.2 +webarena_verified.72.80.2,False,map,AgentResponseEvaluator,80,test,webarena_verified.65.76.2 +webarena_verified.72.81.2,False,map,AgentResponseEvaluator,81,test,webarena_verified.72.80.2 +webarena_verified.72.82.2,False,map,AgentResponseEvaluator,82,train,webarena_verified.72.81.2 +webarena_verified.72.83.2,False,map,AgentResponseEvaluator,83,train,webarena_verified.72.82.2 +webarena_verified.64.84.2,False,map,AgentResponseEvaluator,84,train,webarena_verified.72.83.2 +webarena_verified.64.85.2,False,map,AgentResponseEvaluator,85,test,webarena_verified.64.84.2 +webarena_verified.64.86.2,False,map,AgentResponseEvaluator,86,test,webarena_verified.64.85.2 +webarena_verified.64.87.2,False,map,AgentResponseEvaluator,87,train,webarena_verified.64.86.2 +webarena_verified.64.88.2,False,map,AgentResponseEvaluator,88,train,webarena_verified.64.87.2 +webarena_verified.67.89.3,False,map,AgentResponseEvaluator,89,test,webarena_verified.64.88.2 +webarena_verified.67.90.3,False,map,AgentResponseEvaluator,90,test,webarena_verified.67.89.3 +webarena_verified.67.91.3,False,map,AgentResponseEvaluator,91,train,webarena_verified.67.90.3 +webarena_verified.67.92.3,False,map,AgentResponseEvaluator,92,train,webarena_verified.67.91.3 +webarena_verified.67.93.3,False,map,AgentResponseEvaluator,93,train,webarena_verified.67.92.3 +webarena_verified.274.94.2,False,shopping_admin,AgentResponseEvaluator,94,test,webarena_verified.277.79.2 +webarena_verified.274.95.2,False,shopping_admin,AgentResponseEvaluator,95,train,webarena_verified.274.94.2 +webarena_verified.193.96.2,False,shopping,AgentResponseEvaluator,96,test,webarena_verified.197.51.2 +webarena_verified.120.97.2,False,map wikipedia,AgentResponseEvaluator NetworkEventEvaluator,97,test,webarena_verified.67.93.3 +webarena_verified.66.98.2,False,map,AgentResponseEvaluator,98,test,webarena_verified.120.97.2 
+webarena_verified.66.99.2,False,map,AgentResponseEvaluator,99,train,webarena_verified.66.98.2 +webarena_verified.66.100.2,False,map,AgentResponseEvaluator,100,test,webarena_verified.66.99.2 +webarena_verified.66.101.2,False,map,AgentResponseEvaluator,101,train,webarena_verified.66.100.2 +webarena_verified.349.102.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,102,train,webarena_verified.300.46.2 +webarena_verified.349.103.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,103,train,webarena_verified.349.102.2 +webarena_verified.349.104.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,104,test,webarena_verified.349.103.2 +webarena_verified.349.105.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,105,train,webarena_verified.349.104.2 +webarena_verified.349.106.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,106,test,webarena_verified.349.105.2 +webarena_verified.270.107.2,False,shopping_admin,AgentResponseEvaluator,107,test,webarena_verified.274.95.2 +webarena_verified.270.108.2,False,shopping_admin,AgentResponseEvaluator,108,train,webarena_verified.270.107.2 +webarena_verified.270.109.2,False,shopping_admin,AgentResponseEvaluator,109,test,webarena_verified.270.108.2 +webarena_verified.270.110.2,False,shopping_admin,AgentResponseEvaluator,110,train,webarena_verified.270.109.2 +webarena_verified.270.111.2,False,shopping_admin,AgentResponseEvaluator,111,train,webarena_verified.270.110.2 +webarena_verified.245.112.2,False,shopping_admin,AgentResponseEvaluator,112,test,webarena_verified.270.111.2 +webarena_verified.245.113.2,False,shopping_admin,AgentResponseEvaluator,113,test,webarena_verified.245.112.2 +webarena_verified.245.114.2,False,shopping_admin,AgentResponseEvaluator,114,train,webarena_verified.245.113.2 
+webarena_verified.245.115.2,False,shopping_admin,AgentResponseEvaluator,115,test,webarena_verified.245.114.2 +webarena_verified.245.116.2,False,shopping_admin,AgentResponseEvaluator,116,test,webarena_verified.245.115.2 +webarena_verified.161.117.2,False,shopping,AgentResponseEvaluator,117,test,webarena_verified.193.96.2 +webarena_verified.151.118.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,118,train,webarena_verified.161.117.2 +webarena_verified.250.119.2,False,shopping_admin,AgentResponseEvaluator,119,test,webarena_verified.245.116.2 +webarena_verified.250.120.2,False,shopping_admin,AgentResponseEvaluator,120,train,webarena_verified.250.119.2 +webarena_verified.250.121.2,False,shopping_admin,AgentResponseEvaluator,121,train,webarena_verified.250.120.2 +webarena_verified.250.122.2,False,shopping_admin,AgentResponseEvaluator,122,test,webarena_verified.250.121.2 +webarena_verified.250.123.2,False,shopping_admin,AgentResponseEvaluator,123,train,webarena_verified.250.122.2 +webarena_verified.159.124.2,False,shopping,AgentResponseEvaluator,124,train,webarena_verified.151.118.2 +webarena_verified.159.125.2,False,shopping,AgentResponseEvaluator,125,train,webarena_verified.159.124.2 +webarena_verified.159.126.2,False,shopping,AgentResponseEvaluator,126,test,webarena_verified.159.125.2 +webarena_verified.1001.127.2,False,shopping_admin,AgentResponseEvaluator,127,train,webarena_verified.250.123.2 +webarena_verified.1002.128.2,False,shopping_admin,AgentResponseEvaluator,128,train,webarena_verified.1001.127.2 +webarena_verified.1002.129.2,False,shopping_admin,AgentResponseEvaluator,129,train,webarena_verified.1002.128.2 +webarena_verified.1002.130.2,False,shopping_admin,AgentResponseEvaluator,130,train,webarena_verified.1002.129.2 +webarena_verified.1002.131.2,False,shopping_admin,AgentResponseEvaluator,131,test,webarena_verified.1002.130.2 +webarena_verified.322.132.2,False,gitlab,AgentResponseEvaluator,132,train,webarena_verified.349.106.2 
+webarena_verified.322.133.2,False,gitlab,AgentResponseEvaluator,133,test,webarena_verified.322.132.2 +webarena_verified.322.134.2,False,gitlab,AgentResponseEvaluator,134,test,webarena_verified.322.133.2 +webarena_verified.322.135.2,False,gitlab,AgentResponseEvaluator,135,train,webarena_verified.322.134.2 +webarena_verified.322.136.2,False,gitlab,AgentResponseEvaluator,136,train,webarena_verified.322.135.2 +webarena_verified.51.137.2,False,map,AgentResponseEvaluator,137,test,webarena_verified.66.101.2 +webarena_verified.51.138.2,False,map,AgentResponseEvaluator,138,test,webarena_verified.51.137.2 +webarena_verified.51.139.2,False,map,AgentResponseEvaluator,139,test,webarena_verified.51.138.2 +webarena_verified.51.140.2,False,map,AgentResponseEvaluator,140,train,webarena_verified.51.139.2 +webarena_verified.162.141.2,False,shopping,AgentResponseEvaluator,141,train,webarena_verified.159.126.2 +webarena_verified.162.142.2,False,shopping,AgentResponseEvaluator,142,train,webarena_verified.162.141.2 +webarena_verified.162.143.2,False,shopping,AgentResponseEvaluator,143,test,webarena_verified.162.142.2 +webarena_verified.162.144.2,False,shopping,AgentResponseEvaluator,144,test,webarena_verified.162.143.2 +webarena_verified.162.145.2,False,shopping,AgentResponseEvaluator,145,train,webarena_verified.162.144.2 +webarena_verified.155.146.2,False,shopping,AgentResponseEvaluator,146,test,webarena_verified.162.145.2 +webarena_verified.155.147.2,False,shopping,AgentResponseEvaluator,147,train,webarena_verified.155.146.2 +webarena_verified.155.148.2,False,shopping,AgentResponseEvaluator,148,train,webarena_verified.155.147.2 +webarena_verified.155.149.2,False,shopping,AgentResponseEvaluator,149,test,webarena_verified.155.148.2 +webarena_verified.155.150.2,False,shopping,AgentResponseEvaluator,150,train,webarena_verified.155.149.2 +webarena_verified.36.151.2,False,map,AgentResponseEvaluator,151,train,webarena_verified.51.140.2 
+webarena_verified.36.152.2,False,map,AgentResponseEvaluator,152,train,webarena_verified.36.151.2 +webarena_verified.36.153.2,False,map,AgentResponseEvaluator,153,test,webarena_verified.36.152.2 +webarena_verified.36.154.2,False,map,AgentResponseEvaluator,154,train,webarena_verified.36.153.2 +webarena_verified.36.155.2,False,map,AgentResponseEvaluator,155,test,webarena_verified.36.154.2 +webarena_verified.290.156.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,156,test,webarena_verified.322.136.2 +webarena_verified.255.157.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,157,train,webarena_verified.1002.131.2 +webarena_verified.171.158.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,158,test,webarena_verified.155.150.2 +webarena_verified.171.159.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,159,train,webarena_verified.171.158.2 +webarena_verified.171.160.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,160,train,webarena_verified.171.159.2 +webarena_verified.171.161.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,161,train,webarena_verified.171.160.2 +webarena_verified.171.162.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,162,test,webarena_verified.171.161.2 +webarena_verified.136.163.2,False,shopping,AgentResponseEvaluator,163,test,webarena_verified.171.162.2 +webarena_verified.136.164.2,False,shopping,AgentResponseEvaluator,164,test,webarena_verified.136.163.2 +webarena_verified.136.165.2,False,shopping,AgentResponseEvaluator,165,test,webarena_verified.136.164.2 +webarena_verified.136.166.2,False,shopping,AgentResponseEvaluator,166,test,webarena_verified.136.165.2 +webarena_verified.136.167.2,False,shopping,AgentResponseEvaluator,167,test,webarena_verified.136.166.2 +webarena_verified.289.168.2,False,gitlab,AgentResponseEvaluator,168,test,webarena_verified.290.156.2 
+webarena_verified.289.169.2,False,gitlab,AgentResponseEvaluator,169,train,webarena_verified.289.168.2 +webarena_verified.289.170.2,False,gitlab,AgentResponseEvaluator,170,train,webarena_verified.289.169.2 +webarena_verified.289.171.2,False,gitlab,AgentResponseEvaluator,171,test,webarena_verified.289.170.2 +webarena_verified.289.172.2,False,gitlab,AgentResponseEvaluator,172,train,webarena_verified.289.171.2 +webarena_verified.310.173.2,False,gitlab,AgentResponseEvaluator,173,train,webarena_verified.289.172.2 +webarena_verified.310.174.2,False,gitlab,AgentResponseEvaluator,174,test,webarena_verified.310.173.2 +webarena_verified.310.175.2,False,gitlab,AgentResponseEvaluator,175,train,webarena_verified.310.174.2 +webarena_verified.310.176.2,False,gitlab,AgentResponseEvaluator,176,train,webarena_verified.310.175.2 +webarena_verified.310.177.2,False,gitlab,AgentResponseEvaluator,177,test,webarena_verified.310.176.2 +webarena_verified.500.178.2,False,gitlab,AgentResponseEvaluator,178,test,webarena_verified.310.177.2 +webarena_verified.500.179.2,False,gitlab,AgentResponseEvaluator,179,train,webarena_verified.500.178.2 +webarena_verified.500.180.2,False,gitlab,AgentResponseEvaluator,180,train,webarena_verified.500.179.2 +webarena_verified.500.181.2,False,gitlab,AgentResponseEvaluator,181,test,webarena_verified.500.180.2 +webarena_verified.500.182.2,False,gitlab,AgentResponseEvaluator,182,train,webarena_verified.500.181.2 +webarena_verified.368.183.2,False,shopping_admin,AgentResponseEvaluator,183,train,webarena_verified.255.157.2 +webarena_verified.368.184.2,False,shopping_admin,AgentResponseEvaluator,184,train,webarena_verified.368.183.2 +webarena_verified.368.185.2,False,shopping_admin,AgentResponseEvaluator,185,test,webarena_verified.368.184.2 +webarena_verified.368.186.2,False,shopping_admin,AgentResponseEvaluator,186,train,webarena_verified.368.185.2 +webarena_verified.368.187.2,False,shopping_admin,AgentResponseEvaluator,187,test,webarena_verified.368.186.2 
+webarena_verified.214.188.2,False,shopping,AgentResponseEvaluator,188,test,webarena_verified.136.167.2 +webarena_verified.214.189.2,False,shopping,AgentResponseEvaluator,189,train,webarena_verified.214.188.2 +webarena_verified.214.190.2,False,shopping,AgentResponseEvaluator,190,train,webarena_verified.214.189.2 +webarena_verified.214.191.2,False,shopping,AgentResponseEvaluator,191,train,webarena_verified.214.190.2 +webarena_verified.214.192.2,False,shopping,AgentResponseEvaluator,192,test,webarena_verified.214.191.2 +webarena_verified.367.193.2,False,shopping_admin,AgentResponseEvaluator,193,train,webarena_verified.368.187.2 +webarena_verified.367.194.2,False,shopping_admin,AgentResponseEvaluator,194,train,webarena_verified.367.193.2 +webarena_verified.367.195.2,False,shopping_admin,AgentResponseEvaluator,195,test,webarena_verified.367.194.2 +webarena_verified.367.196.2,False,shopping_admin,AgentResponseEvaluator,196,train,webarena_verified.367.195.2 +webarena_verified.367.197.2,False,shopping_admin,AgentResponseEvaluator,197,train,webarena_verified.367.196.2 +webarena_verified.366.198.2,False,shopping_admin,AgentResponseEvaluator,198,train,webarena_verified.367.197.2 +webarena_verified.366.199.2,False,shopping_admin,AgentResponseEvaluator,199,train,webarena_verified.366.198.2 +webarena_verified.366.200.2,False,shopping_admin,AgentResponseEvaluator,200,train,webarena_verified.366.199.2 +webarena_verified.366.201.2,False,shopping_admin,AgentResponseEvaluator,201,test,webarena_verified.366.200.2 +webarena_verified.366.202.2,False,shopping_admin,AgentResponseEvaluator,202,train,webarena_verified.366.201.2 +webarena_verified.366.203.2,False,shopping_admin,AgentResponseEvaluator,203,test,webarena_verified.366.202.2 +webarena_verified.366.204.2,False,shopping_admin,AgentResponseEvaluator,204,test,webarena_verified.366.203.2 +webarena_verified.320.205.2,False,gitlab,AgentResponseEvaluator,205,train,webarena_verified.500.182.2 
+webarena_verified.320.206.2,False,gitlab,AgentResponseEvaluator,206,test,webarena_verified.320.205.2 +webarena_verified.320.207.2,False,gitlab,AgentResponseEvaluator,207,test,webarena_verified.320.206.2 +webarena_verified.364.208.2,False,shopping_admin,AgentResponseEvaluator,208,test,webarena_verified.366.204.2 +webarena_verified.364.209.2,False,shopping_admin,AgentResponseEvaluator,209,test,webarena_verified.364.208.2 +webarena_verified.364.210.2,False,shopping_admin,AgentResponseEvaluator,210,train,webarena_verified.364.209.2 +webarena_verified.364.211.2,False,shopping_admin,AgentResponseEvaluator,211,train,webarena_verified.364.210.2 +webarena_verified.364.212.2,False,shopping_admin,AgentResponseEvaluator,212,train,webarena_verified.364.211.2 +webarena_verified.249.213.2,False,shopping_admin,AgentResponseEvaluator,213,test,webarena_verified.364.212.2 +webarena_verified.249.214.2,False,shopping_admin,AgentResponseEvaluator,214,train,webarena_verified.249.213.2 +webarena_verified.249.215.2,False,shopping_admin,AgentResponseEvaluator,215,test,webarena_verified.249.214.2 +webarena_verified.249.216.2,False,shopping_admin,AgentResponseEvaluator,216,train,webarena_verified.249.215.2 +webarena_verified.249.217.2,False,shopping_admin,AgentResponseEvaluator,217,train,webarena_verified.249.216.2 +webarena_verified.41.218.2,False,map,AgentResponseEvaluator,218,train,webarena_verified.36.155.2 +webarena_verified.41.219.2,False,map,AgentResponseEvaluator,219,test,webarena_verified.41.218.2 +webarena_verified.41.220.2,False,map,AgentResponseEvaluator,220,train,webarena_verified.41.219.2 +webarena_verified.35.221.2,False,map,AgentResponseEvaluator,221,test,webarena_verified.41.220.2 +webarena_verified.35.222.2,False,map,AgentResponseEvaluator,222,train,webarena_verified.35.221.2 +webarena_verified.35.223.2,False,map,AgentResponseEvaluator,223,test,webarena_verified.35.222.2 +webarena_verified.35.224.2,False,map,AgentResponseEvaluator,224,test,webarena_verified.35.223.2 
+webarena_verified.135.225.2,False,shopping,AgentResponseEvaluator,225,test,webarena_verified.214.192.2 +webarena_verified.370.226.2,False,shopping,AgentResponseEvaluator,226,train,webarena_verified.135.225.2 +webarena_verified.370.227.2,False,shopping,AgentResponseEvaluator,227,train,webarena_verified.370.226.2 +webarena_verified.370.228.2,False,shopping,AgentResponseEvaluator,228,test,webarena_verified.370.227.2 +webarena_verified.370.229.2,False,shopping,AgentResponseEvaluator,229,test,webarena_verified.370.228.2 +webarena_verified.370.230.2,False,shopping,AgentResponseEvaluator,230,train,webarena_verified.370.229.2 +webarena_verified.213.231.2,False,shopping,AgentResponseEvaluator,231,test,webarena_verified.370.230.2 +webarena_verified.213.232.2,False,shopping,AgentResponseEvaluator,232,train,webarena_verified.213.231.2 +webarena_verified.213.233.2,False,shopping,AgentResponseEvaluator,233,test,webarena_verified.213.232.2 +webarena_verified.213.234.2,False,shopping,AgentResponseEvaluator,234,train,webarena_verified.213.233.2 +webarena_verified.213.235.2,False,shopping,AgentResponseEvaluator,235,train,webarena_verified.213.234.2 +webarena_verified.39.236.2,False,map,AgentResponseEvaluator,236,train,webarena_verified.35.224.2 +webarena_verified.39.237.2,False,map,AgentResponseEvaluator,237,train,webarena_verified.39.236.2 +webarena_verified.138.238.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,238,train,webarena_verified.213.235.2 +webarena_verified.138.239.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,239,train,webarena_verified.138.238.2 +webarena_verified.138.240.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,240,test,webarena_verified.138.239.2 +webarena_verified.138.241.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,241,train,webarena_verified.138.240.2 +webarena_verified.138.242.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,242,test,webarena_verified.138.241.2 
+webarena_verified.244.243.2,False,shopping_admin,AgentResponseEvaluator,243,train,webarena_verified.249.217.2 +webarena_verified.244.244.2,False,shopping_admin,AgentResponseEvaluator,244,test,webarena_verified.244.243.2 +webarena_verified.244.245.2,False,shopping_admin,AgentResponseEvaluator,245,train,webarena_verified.244.244.2 +webarena_verified.244.246.2,False,shopping_admin,AgentResponseEvaluator,246,test,webarena_verified.244.245.2 +webarena_verified.244.247.2,False,shopping_admin,AgentResponseEvaluator,247,train,webarena_verified.244.246.2 +webarena_verified.46.248.2,False,map,AgentResponseEvaluator,248,test,webarena_verified.39.237.2 +webarena_verified.46.249.2,False,map,AgentResponseEvaluator,249,train,webarena_verified.46.248.2 +webarena_verified.46.250.2,False,map,AgentResponseEvaluator,250,test,webarena_verified.46.249.2 +webarena_verified.46.251.2,False,map,AgentResponseEvaluator,251,train,webarena_verified.46.250.2 +webarena_verified.46.252.2,False,map,AgentResponseEvaluator,252,train,webarena_verified.46.251.2 +webarena_verified.501.253.2,False,map,AgentResponseEvaluator,253,test,webarena_verified.46.252.2 +webarena_verified.501.254.2,False,map,AgentResponseEvaluator,254,train,webarena_verified.501.253.2 +webarena_verified.501.255.2,False,map,AgentResponseEvaluator,255,test,webarena_verified.501.254.2 +webarena_verified.501.256.2,False,map,AgentResponseEvaluator,256,train,webarena_verified.501.255.2 +webarena_verified.501.257.2,False,map,AgentResponseEvaluator,257,test,webarena_verified.501.256.2 +webarena_verified.325.258.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,258,train,webarena_verified.320.207.2 +webarena_verified.312.259.2,False,gitlab,AgentResponseEvaluator,259,train,webarena_verified.325.258.2 +webarena_verified.211.260.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,260,test,webarena_verified.138.242.2 +webarena_verified.211.261.2,False,shopping,AgentResponseEvaluator 
NetworkEventEvaluator,261,train,webarena_verified.211.260.2 +webarena_verified.211.262.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,262,train,webarena_verified.211.261.2 +webarena_verified.211.263.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,263,test,webarena_verified.211.262.2 +webarena_verified.211.264.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,264,train,webarena_verified.211.263.2 +webarena_verified.85.265.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,265,test,webarena_verified.501.257.2 +webarena_verified.85.266.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,266,test,webarena_verified.85.265.4 +webarena_verified.85.267.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,267,train,webarena_verified.85.266.4 +webarena_verified.85.268.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,268,test,webarena_verified.85.267.4 +webarena_verified.139.269.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,269,train,webarena_verified.211.264.2 +webarena_verified.139.270.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,270,train,webarena_verified.139.269.2 +webarena_verified.139.271.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,271,test,webarena_verified.139.270.2 +webarena_verified.139.272.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,272,test,webarena_verified.139.271.2 +webarena_verified.139.273.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,273,train,webarena_verified.139.272.2 +webarena_verified.212.274.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,274,test,webarena_verified.139.273.2 +webarena_verified.212.275.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,275,test,webarena_verified.212.274.2 +webarena_verified.212.276.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,276,train,webarena_verified.212.275.2 
+webarena_verified.212.277.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,277,train,webarena_verified.212.276.2 +webarena_verified.212.278.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,278,train,webarena_verified.212.277.2 +webarena_verified.204.279.2,False,shopping,AgentResponseEvaluator,279,train,webarena_verified.212.278.2 +webarena_verified.204.280.2,False,shopping,AgentResponseEvaluator,280,test,webarena_verified.204.279.2 +webarena_verified.204.281.2,False,shopping,AgentResponseEvaluator,281,train,webarena_verified.204.280.2 +webarena_verified.204.282.2,False,shopping,AgentResponseEvaluator,282,train,webarena_verified.204.281.2 +webarena_verified.210.283.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,283,test,webarena_verified.204.282.2 +webarena_verified.207.284.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,284,test,webarena_verified.210.283.2 +webarena_verified.207.285.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,285,train,webarena_verified.207.284.2 +webarena_verified.207.286.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,286,test,webarena_verified.207.285.2 +webarena_verified.47.287.2,False,map,AgentResponseEvaluator,287,test,webarena_verified.85.268.4 +webarena_verified.234.288.2,False,shopping_admin,AgentResponseEvaluator,288,train,webarena_verified.244.247.2 +webarena_verified.234.289.2,False,shopping_admin,AgentResponseEvaluator,289,test,webarena_verified.234.288.2 +webarena_verified.234.290.2,False,shopping_admin,AgentResponseEvaluator,290,train,webarena_verified.234.289.2 +webarena_verified.234.291.2,False,shopping_admin,AgentResponseEvaluator,291,train,webarena_verified.234.290.2 +webarena_verified.234.292.2,False,shopping_admin,AgentResponseEvaluator,292,test,webarena_verified.234.291.2 +webarena_verified.329.293.2,False,gitlab,AgentResponseEvaluator,293,train,webarena_verified.312.259.2 
+webarena_verified.329.294.2,False,gitlab,AgentResponseEvaluator,294,train,webarena_verified.329.293.2 +webarena_verified.329.295.2,False,gitlab,AgentResponseEvaluator,295,test,webarena_verified.329.294.2 +webarena_verified.329.296.2,False,gitlab,AgentResponseEvaluator,296,train,webarena_verified.329.295.2 +webarena_verified.329.297.2,False,gitlab,AgentResponseEvaluator,297,test,webarena_verified.329.296.2 +webarena_verified.180.298.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,298,train,webarena_verified.207.286.2 +webarena_verified.180.299.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,299,train,webarena_verified.180.298.2 +webarena_verified.180.300.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,300,test,webarena_verified.180.299.2 +webarena_verified.180.301.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,301,test,webarena_verified.180.300.2 +webarena_verified.180.302.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,302,train,webarena_verified.180.301.2 +webarena_verified.321.303.2,False,gitlab,AgentResponseEvaluator,303,test,webarena_verified.329.297.2 +webarena_verified.321.304.2,False,gitlab,AgentResponseEvaluator,304,train,webarena_verified.321.303.2 +webarena_verified.321.305.2,False,gitlab,AgentResponseEvaluator,305,train,webarena_verified.321.304.2 +webarena_verified.321.306.2,False,gitlab,AgentResponseEvaluator,306,test,webarena_verified.321.305.2 +webarena_verified.321.307.2,False,gitlab,AgentResponseEvaluator,307,train,webarena_verified.321.306.2 +webarena_verified.323.308.2,False,gitlab,AgentResponseEvaluator,308,train,webarena_verified.321.307.2 +webarena_verified.323.309.2,False,gitlab,AgentResponseEvaluator,309,train,webarena_verified.323.308.2 +webarena_verified.323.310.2,False,gitlab,AgentResponseEvaluator,310,train,webarena_verified.323.309.2 +webarena_verified.323.311.2,False,gitlab,AgentResponseEvaluator,311,test,webarena_verified.323.310.2 
+webarena_verified.323.312.2,False,gitlab,AgentResponseEvaluator,312,test,webarena_verified.323.311.2 +webarena_verified.134.313.2,False,shopping,AgentResponseEvaluator,313,train,webarena_verified.180.302.2 +webarena_verified.324.314.2,False,gitlab,AgentResponseEvaluator,314,train,webarena_verified.323.312.2 +webarena_verified.324.315.2,False,gitlab,AgentResponseEvaluator,315,train,webarena_verified.324.314.2 +webarena_verified.324.316.2,False,gitlab,AgentResponseEvaluator,316,test,webarena_verified.324.315.2 +webarena_verified.324.317.2,False,gitlab,AgentResponseEvaluator,317,test,webarena_verified.324.316.2 +webarena_verified.324.318.2,False,gitlab,AgentResponseEvaluator,318,train,webarena_verified.324.317.2 +webarena_verified.160.319.2,False,shopping,AgentResponseEvaluator,319,train,webarena_verified.134.313.2 +webarena_verified.160.320.2,False,shopping,AgentResponseEvaluator,320,test,webarena_verified.160.319.2 +webarena_verified.160.321.2,False,shopping,AgentResponseEvaluator,321,train,webarena_verified.160.320.2 +webarena_verified.160.322.2,False,shopping,AgentResponseEvaluator,322,test,webarena_verified.160.321.2 +webarena_verified.160.323.2,False,shopping,AgentResponseEvaluator,323,train,webarena_verified.160.322.2 +webarena_verified.208.324.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,324,train,webarena_verified.160.323.2 +webarena_verified.208.325.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,325,test,webarena_verified.208.324.2 +webarena_verified.208.326.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,326,train,webarena_verified.208.325.2 +webarena_verified.208.327.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,327,test,webarena_verified.208.326.2 +webarena_verified.208.328.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,328,train,webarena_verified.208.327.2 +webarena_verified.147.329.2,False,shopping,AgentResponseEvaluator,329,test,webarena_verified.208.328.2 
+webarena_verified.147.330.2,False,shopping,AgentResponseEvaluator,330,test,webarena_verified.147.329.2 +webarena_verified.147.331.2,False,shopping,AgentResponseEvaluator,331,test,webarena_verified.147.330.2 +webarena_verified.147.332.2,False,shopping,AgentResponseEvaluator,332,train,webarena_verified.147.331.2 +webarena_verified.147.333.2,False,shopping,AgentResponseEvaluator,333,train,webarena_verified.147.332.2 +webarena_verified.169.334.2,False,shopping,AgentResponseEvaluator,334,train,webarena_verified.147.333.2 +webarena_verified.169.335.2,False,shopping,AgentResponseEvaluator,335,train,webarena_verified.169.334.2 +webarena_verified.169.336.2,False,shopping,AgentResponseEvaluator,336,test,webarena_verified.169.335.2 +webarena_verified.169.337.2,False,shopping,AgentResponseEvaluator,337,test,webarena_verified.169.336.2 +webarena_verified.169.338.2,False,shopping,AgentResponseEvaluator,338,train,webarena_verified.169.337.2 +webarena_verified.299.339.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,339,test,webarena_verified.324.318.2 +webarena_verified.299.340.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,340,train,webarena_verified.299.339.2 +webarena_verified.299.341.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,341,test,webarena_verified.299.340.2 +webarena_verified.299.342.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,342,test,webarena_verified.299.341.2 +webarena_verified.299.343.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,343,test,webarena_verified.299.342.2 +webarena_verified.248.344.2,False,shopping_admin,AgentResponseEvaluator,344,test,webarena_verified.234.292.2 +webarena_verified.248.345.2,False,shopping_admin,AgentResponseEvaluator,345,train,webarena_verified.248.344.2 
+webarena_verified.248.346.2,False,shopping_admin,AgentResponseEvaluator,346,train,webarena_verified.248.345.2 +webarena_verified.248.347.2,False,shopping_admin,AgentResponseEvaluator,347,train,webarena_verified.248.346.2 +webarena_verified.248.348.2,False,shopping_admin,AgentResponseEvaluator,348,test,webarena_verified.248.347.2 +webarena_verified.298.349.3,False,gitlab,AgentResponseEvaluator,349,test,webarena_verified.299.343.2 +webarena_verified.298.350.3,False,gitlab,AgentResponseEvaluator,350,test,webarena_verified.298.349.3 +webarena_verified.137.351.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,351,train,webarena_verified.169.338.2 +webarena_verified.137.352.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,352,test,webarena_verified.137.351.2 +webarena_verified.137.353.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,353,test,webarena_verified.137.352.2 +webarena_verified.137.354.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,354,train,webarena_verified.137.353.2 +webarena_verified.137.355.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,355,train,webarena_verified.137.354.2 +webarena_verified.49.356.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,356,test,webarena_verified.47.287.2 +webarena_verified.291.357.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,357,test,webarena_verified.298.350.3 +webarena_verified.206.358.2,False,shopping,AgentResponseEvaluator,358,train,webarena_verified.137.355.2 +webarena_verified.206.359.2,False,shopping,AgentResponseEvaluator,359,test,webarena_verified.206.358.2 +webarena_verified.206.360.2,False,shopping,AgentResponseEvaluator,360,train,webarena_verified.206.359.2 +webarena_verified.206.361.2,False,shopping,AgentResponseEvaluator,361,train,webarena_verified.206.360.2 +webarena_verified.206.362.2,False,shopping,AgentResponseEvaluator,362,test,webarena_verified.206.361.2 
+webarena_verified.58.363.2,False,map,AgentResponseEvaluator,363,train,webarena_verified.49.356.2 +webarena_verified.58.364.2,False,map,AgentResponseEvaluator,364,test,webarena_verified.58.363.2 +webarena_verified.58.365.2,False,map,AgentResponseEvaluator,365,test,webarena_verified.58.364.2 +webarena_verified.58.366.2,False,map,AgentResponseEvaluator,366,train,webarena_verified.58.365.2 +webarena_verified.58.367.2,False,map,AgentResponseEvaluator,367,train,webarena_verified.58.366.2 +webarena_verified.188.368.2,False,shopping,AgentResponseEvaluator,368,test,webarena_verified.206.362.2 +webarena_verified.52.369.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,369,train,webarena_verified.58.367.2 +webarena_verified.52.370.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,370,test,webarena_verified.52.369.2 +webarena_verified.52.371.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,371,test,webarena_verified.52.370.2 +webarena_verified.52.372.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,372,train,webarena_verified.52.371.2 +webarena_verified.52.373.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,373,train,webarena_verified.52.372.2 +webarena_verified.266.374.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,374,train,webarena_verified.248.348.2 +webarena_verified.266.375.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,375,train,webarena_verified.266.374.2 +webarena_verified.182.376.2,False,shopping,AgentResponseEvaluator,376,test,webarena_verified.188.368.2 +webarena_verified.59.377.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,377,test,webarena_verified.52.373.2 +webarena_verified.59.378.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,378,train,webarena_verified.59.377.2 +webarena_verified.59.379.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,379,train,webarena_verified.59.378.2 +webarena_verified.59.380.2,False,map,AgentResponseEvaluator 
NetworkEventEvaluator,380,test,webarena_verified.59.379.2 +webarena_verified.59.381.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,381,train,webarena_verified.59.380.2 +webarena_verified.781.382.2,False,map,AgentResponseEvaluator,382,test,webarena_verified.59.381.2 +webarena_verified.782.383.2,False,map,AgentResponseEvaluator,383,test,webarena_verified.781.382.2 +webarena_verified.666.384.2,False,shopping,AgentResponseEvaluator,384,test,webarena_verified.182.376.2 +webarena_verified.666.385.2,False,shopping,AgentResponseEvaluator,385,train,webarena_verified.666.384.2 +webarena_verified.1355.386.2,False,shopping,AgentResponseEvaluator,386,test,webarena_verified.666.385.2 +webarena_verified.1356.387.2,False,shopping,AgentResponseEvaluator,387,train,webarena_verified.1355.386.2 +webarena_verified.1356.388.2,False,shopping,AgentResponseEvaluator,388,test,webarena_verified.1356.387.2 +webarena_verified.348.389.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,389,test,webarena_verified.291.357.2 +webarena_verified.348.390.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,390,train,webarena_verified.348.389.2 +webarena_verified.348.391.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,391,train,webarena_verified.348.390.2 +webarena_verified.348.392.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,392,test,webarena_verified.348.391.2 +webarena_verified.348.393.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,393,train,webarena_verified.348.392.2 +webarena_verified.352.394.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,394,test,webarena_verified.348.393.2 +webarena_verified.352.395.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,395,train,webarena_verified.352.394.2 +webarena_verified.352.396.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,396,train,webarena_verified.352.395.2 +webarena_verified.352.397.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator,397,train,webarena_verified.352.396.2 +webarena_verified.352.398.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,398,test,webarena_verified.352.397.2 +webarena_verified.6.399.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,399,train,webarena_verified.17.69.2 +webarena_verified.6.400.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,400,test,webarena_verified.6.399.2 +webarena_verified.6.401.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,401,train,webarena_verified.6.400.2 +webarena_verified.6.402.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,402,train,webarena_verified.6.401.2 +webarena_verified.6.403.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,403,test,webarena_verified.6.402.2 +webarena_verified.22.404.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,404,train,webarena_verified.6.403.2 +webarena_verified.22.405.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,405,test,webarena_verified.22.404.2 +webarena_verified.22.406.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,406,train,webarena_verified.22.405.2 +webarena_verified.22.407.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,407,test,webarena_verified.22.406.2 +webarena_verified.22.408.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,408,train,webarena_verified.22.407.2 +webarena_verified.23.409.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,409,test,webarena_verified.22.408.2 +webarena_verified.23.410.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,410,test,webarena_verified.23.409.2 +webarena_verified.355.411.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,411,test,webarena_verified.352.398.2 +webarena_verified.355.412.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,412,test,webarena_verified.355.411.2 
+webarena_verified.355.413.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,413,test,webarena_verified.355.412.2 +webarena_verified.355.414.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,414,test,webarena_verified.355.413.2 +webarena_verified.360.415.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,415,test,webarena_verified.355.414.2 +webarena_verified.360.416.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,416,test,webarena_verified.360.415.2 +webarena_verified.360.417.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,417,test,webarena_verified.360.416.2 +webarena_verified.361.418.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,418,train,webarena_verified.360.417.2 +webarena_verified.361.419.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,419,test,webarena_verified.361.418.2 +webarena_verified.361.420.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,420,test,webarena_verified.361.419.2 +webarena_verified.361.421.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,421,train,webarena_verified.361.420.2 +webarena_verified.361.422.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,422,train,webarena_verified.361.421.2 +webarena_verified.237.423.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,423,train,webarena_verified.266.375.2 +webarena_verified.371.424.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,424,train,webarena_verified.782.383.2 +webarena_verified.371.425.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,425,train,webarena_verified.371.424.2 +webarena_verified.371.426.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,426,test,webarena_verified.371.425.2 +webarena_verified.371.427.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,427,test,webarena_verified.371.426.2 +webarena_verified.371.428.2,False,wikipedia map,AgentResponseEvaluator 
NetworkEventEvaluator,428,train,webarena_verified.371.427.2 +webarena_verified.371.429.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,429,train,webarena_verified.371.428.2 +webarena_verified.371.430.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,430,test,webarena_verified.371.429.2 +webarena_verified.145.431.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,431,train,webarena_verified.1356.388.2 +webarena_verified.145.432.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,432,test,webarena_verified.145.431.2 +webarena_verified.145.433.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,433,train,webarena_verified.145.432.2 +webarena_verified.145.434.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,434,train,webarena_verified.145.433.2 +webarena_verified.145.435.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,435,train,webarena_verified.145.434.2 +webarena_verified.156.436.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,436,test,webarena_verified.145.435.2 +webarena_verified.156.437.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,437,train,webarena_verified.156.436.2 +webarena_verified.156.438.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,438,train,webarena_verified.156.437.2 +webarena_verified.156.439.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,439,train,webarena_verified.156.438.2 +webarena_verified.156.440.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,440,test,webarena_verified.156.439.2 +webarena_verified.308.441.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,441,train,webarena_verified.361.422.2 +webarena_verified.308.442.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,442,train,webarena_verified.308.441.2 +webarena_verified.308.443.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,443,test,webarena_verified.308.442.2 
+webarena_verified.308.444.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,444,train,webarena_verified.308.443.2 +webarena_verified.308.445.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,445,test,webarena_verified.308.444.2 +webarena_verified.999.446.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,446,test,webarena_verified.308.445.2 +webarena_verified.999.447.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,447,train,webarena_verified.999.446.2 +webarena_verified.331.448.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,448,test,webarena_verified.999.447.2 +webarena_verified.331.449.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,449,test,webarena_verified.331.448.2 +webarena_verified.331.450.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,450,train,webarena_verified.331.449.2 +webarena_verified.331.451.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,451,train,webarena_verified.331.450.2 +webarena_verified.331.452.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,452,train,webarena_verified.331.451.2 +webarena_verified.242.453.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,453,train,webarena_verified.237.423.2 +webarena_verified.242.454.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,454,test,webarena_verified.242.453.2 +webarena_verified.242.455.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,455,train,webarena_verified.242.454.2 +webarena_verified.242.456.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,456,test,webarena_verified.242.455.2 +webarena_verified.242.457.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,457,train,webarena_verified.242.456.2 +webarena_verified.247.458.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,458,test,webarena_verified.242.457.2 +webarena_verified.247.459.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,459,test,webarena_verified.247.458.2 +webarena_verified.247.460.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,460,train,webarena_verified.247.459.2 +webarena_verified.247.461.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,461,train,webarena_verified.247.460.2 +webarena_verified.247.462.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,462,test,webarena_verified.247.461.2 +webarena_verified.247.463.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,463,test,webarena_verified.247.462.2 +webarena_verified.251.464.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,464,train,webarena_verified.247.463.2 +webarena_verified.186.465.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,465,train,webarena_verified.156.440.2 +webarena_verified.186.466.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,466,train,webarena_verified.186.465.2 +webarena_verified.186.467.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,467,train,webarena_verified.186.466.2 +webarena_verified.186.468.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,468,test,webarena_verified.186.467.2 +webarena_verified.186.469.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,469,test,webarena_verified.186.468.2 +webarena_verified.257.470.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,470,test,webarena_verified.251.464.2 +webarena_verified.257.471.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,471,test,webarena_verified.257.470.2 +webarena_verified.257.472.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,472,train,webarena_verified.257.471.2 +webarena_verified.257.473.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,473,train,webarena_verified.257.472.2 +webarena_verified.257.474.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,474,train,webarena_verified.257.473.2 +webarena_verified.292.475.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,475,train,webarena_verified.331.452.2 +webarena_verified.292.476.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,476,train,webarena_verified.292.475.2 +webarena_verified.292.477.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,477,train,webarena_verified.292.476.2 +webarena_verified.292.478.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,478,test,webarena_verified.292.477.2 +webarena_verified.292.479.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,479,test,webarena_verified.292.478.2 +webarena_verified.293.480.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,480,train,webarena_verified.292.479.2 +webarena_verified.294.481.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,481,train,webarena_verified.293.480.2 +webarena_verified.294.482.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,482,train,webarena_verified.294.481.2 +webarena_verified.294.483.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,483,test,webarena_verified.294.482.2 +webarena_verified.294.484.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,484,train,webarena_verified.294.483.2 +webarena_verified.294.485.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,485,test,webarena_verified.294.484.2 +webarena_verified.275.486.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,486,train,webarena_verified.257.474.2 +webarena_verified.275.487.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,487,test,webarena_verified.275.486.2 +webarena_verified.275.488.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,488,test,webarena_verified.275.487.2 +webarena_verified.275.489.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,489,train,webarena_verified.275.488.2 
+webarena_verified.275.490.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,490,train,webarena_verified.275.489.2 +webarena_verified.280.491.2,False,shopping_admin,AgentResponseEvaluator,491,test,webarena_verified.275.490.2 +webarena_verified.280.492.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,492,train,webarena_verified.280.491.2 +webarena_verified.280.493.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,493,train,webarena_verified.280.492.2 +webarena_verified.280.494.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,494,train,webarena_verified.280.493.2 +webarena_verified.280.495.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,495,test,webarena_verified.280.494.2 +webarena_verified.284.496.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,496,train,webarena_verified.280.495.2 +webarena_verified.284.497.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,497,test,webarena_verified.284.496.2 +webarena_verified.284.498.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,498,test,webarena_verified.284.497.2 +webarena_verified.284.499.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,499,train,webarena_verified.284.498.2 +webarena_verified.284.500.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,500,train,webarena_verified.284.499.2 +webarena_verified.287.501.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,501,train,webarena_verified.284.500.2 +webarena_verified.287.502.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,502,test,webarena_verified.287.501.2 +webarena_verified.287.503.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,503,train,webarena_verified.287.502.2 +webarena_verified.287.504.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,504,test,webarena_verified.287.503.2 
+webarena_verified.287.505.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,505,train,webarena_verified.287.504.2 +webarena_verified.172.506.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,506,train,webarena_verified.186.469.2 +webarena_verified.172.507.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,507,train,webarena_verified.172.506.2 +webarena_verified.172.508.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,508,test,webarena_verified.172.507.2 +webarena_verified.216.509.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,509,test,webarena_verified.172.508.2 +webarena_verified.216.510.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,510,test,webarena_verified.216.509.2 +webarena_verified.189.511.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,511,test,webarena_verified.216.510.2 +webarena_verified.189.512.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,512,train,webarena_verified.189.511.2 +webarena_verified.189.513.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,513,train,webarena_verified.189.512.2 +webarena_verified.189.514.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,514,test,webarena_verified.189.513.2 +webarena_verified.189.515.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,515,train,webarena_verified.189.514.2 +webarena_verified.196.516.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,516,train,webarena_verified.189.515.2 +webarena_verified.196.517.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,517,test,webarena_verified.196.516.2 
+webarena_verified.196.518.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,518,test,webarena_verified.196.517.2 +webarena_verified.196.519.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,519,test,webarena_verified.196.518.2 +webarena_verified.196.520.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,520,train,webarena_verified.196.519.2 +webarena_verified.199.521.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,521,test,webarena_verified.196.520.2 +webarena_verified.352.522.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,522,test,webarena_verified.294.485.2 +webarena_verified.354.523.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,523,train,webarena_verified.352.522.2 +webarena_verified.354.524.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,524,test,webarena_verified.354.523.2 +webarena_verified.354.525.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,525,train,webarena_verified.354.524.2 +webarena_verified.354.526.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,526,train,webarena_verified.354.525.2 +webarena_verified.354.527.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,527,test,webarena_verified.354.526.2 +webarena_verified.154.528.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,528,train,webarena_verified.199.521.2 +webarena_verified.154.529.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,529,test,webarena_verified.154.528.2 +webarena_verified.154.530.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator 
NetworkEventEvaluator,530,test,webarena_verified.154.529.2 +webarena_verified.154.531.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,531,train,webarena_verified.154.530.2 +webarena_verified.154.532.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,532,train,webarena_verified.154.531.2 +webarena_verified.330.533.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,533,test,webarena_verified.354.527.2 +webarena_verified.330.534.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,534,train,webarena_verified.330.533.2 +webarena_verified.330.535.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,535,test,webarena_verified.330.534.2 +webarena_verified.330.536.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,536,train,webarena_verified.330.535.2 +webarena_verified.330.537.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,537,train,webarena_verified.330.536.2 +webarena_verified.240.538.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,538,train,webarena_verified.287.505.2 +webarena_verified.240.539.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,539,train,webarena_verified.240.538.2 +webarena_verified.240.540.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,540,test,webarena_verified.240.539.2 +webarena_verified.240.541.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,541,test,webarena_verified.240.540.2 +webarena_verified.240.542.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,542,train,webarena_verified.240.541.2 +webarena_verified.251.543.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,543,test,webarena_verified.240.542.2 
+webarena_verified.251.544.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,544,test,webarena_verified.251.543.2 +webarena_verified.251.545.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,545,test,webarena_verified.251.544.2 +webarena_verified.251.546.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,546,train,webarena_verified.251.545.2 +webarena_verified.252.547.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,547,train,webarena_verified.251.546.2 +webarena_verified.252.548.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,548,train,webarena_verified.252.547.2 +webarena_verified.252.549.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,549,test,webarena_verified.252.548.2 +webarena_verified.252.550.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,550,train,webarena_verified.252.549.2 +webarena_verified.252.551.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,551,test,webarena_verified.252.550.2 +webarena_verified.84.552.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,552,test,webarena_verified.23.410.2 +webarena_verified.84.553.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,553,test,webarena_verified.84.552.2 +webarena_verified.84.554.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,554,test,webarena_verified.84.553.2 +webarena_verified.84.555.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,555,test,webarena_verified.84.554.2 +webarena_verified.87.556.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,556,train,webarena_verified.84.555.2 +webarena_verified.87.557.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,557,test,webarena_verified.87.556.3 
+webarena_verified.87.558.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,558,train,webarena_verified.87.557.3 +webarena_verified.87.559.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,559,train,webarena_verified.87.558.3 +webarena_verified.87.560.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,560,test,webarena_verified.87.559.3 +webarena_verified.87.561.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,561,test,webarena_verified.87.560.3 +webarena_verified.88.562.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,562,train,webarena_verified.84.555.2 +webarena_verified.88.563.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,563,train,webarena_verified.88.562.2 +webarena_verified.88.564.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,564,train,webarena_verified.88.563.2 +webarena_verified.88.565.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,565,test,webarena_verified.88.564.2 +webarena_verified.88.566.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,566,test,webarena_verified.88.565.2 +webarena_verified.293.567.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,567,test,webarena_verified.88.566.2 +webarena_verified.293.568.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,568,train,webarena_verified.293.567.2 +webarena_verified.293.569.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,569,train,webarena_verified.293.568.2 +webarena_verified.293.570.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator 
NetworkEventEvaluator,570,test,webarena_verified.293.569.2 +webarena_verified.165.571.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,571,test,webarena_verified.154.532.2 +webarena_verified.165.572.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,572,train,webarena_verified.165.571.2 +webarena_verified.165.573.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,573,train,webarena_verified.165.572.2 +webarena_verified.165.574.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,574,test,webarena_verified.165.573.2 +webarena_verified.165.575.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,575,train,webarena_verified.165.574.2 +webarena_verified.351.576.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,576,test,webarena_verified.293.570.2 +webarena_verified.351.577.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,577,train,webarena_verified.351.576.2 +webarena_verified.351.578.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,578,test,webarena_verified.351.577.2 +webarena_verified.351.579.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,579,train,webarena_verified.351.578.2 +webarena_verified.7.580.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,580,train,webarena_verified.88.566.2 +webarena_verified.7.581.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,581,train,webarena_verified.7.580.2 +webarena_verified.7.582.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,582,test,webarena_verified.7.581.2 +webarena_verified.7.583.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,583,test,webarena_verified.7.582.2 +webarena_verified.7.584.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,584,train,webarena_verified.7.583.2 +webarena_verified.194.585.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,585,train,webarena_verified.165.575.2 +webarena_verified.194.586.2,False,shopping,AgentResponseEvaluator 
NetworkEventEvaluator,586,test,webarena_verified.194.585.2 +webarena_verified.194.587.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,587,train,webarena_verified.194.586.2 +webarena_verified.194.588.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,588,train,webarena_verified.194.587.2 +webarena_verified.194.589.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,589,test,webarena_verified.194.588.2 +webarena_verified.339.590.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,590,train,webarena_verified.351.579.2 +webarena_verified.339.591.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,591,test,webarena_verified.339.590.2 +webarena_verified.339.592.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,592,test,webarena_verified.339.591.2 +webarena_verified.339.593.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,593,test,webarena_verified.339.592.2 +webarena_verified.339.594.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,594,train,webarena_verified.339.593.2 +webarena_verified.4.595.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,595,train,webarena_verified.7.584.2 +webarena_verified.4.596.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,596,test,webarena_verified.4.595.2 +webarena_verified.4.597.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,597,train,webarena_verified.4.596.2 +webarena_verified.4.598.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,598,train,webarena_verified.4.597.2 +webarena_verified.4.599.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,599,test,webarena_verified.4.598.2 +webarena_verified.3765.600.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,600,test,webarena_verified.4.599.2 +webarena_verified.3765.601.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,601,train,webarena_verified.3765.600.2 +webarena_verified.3765.602.2,False,reddit,AgentResponseEvaluator 
NetworkEventEvaluator,602,train,webarena_verified.3765.601.2 +webarena_verified.3765.603.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,603,train,webarena_verified.3765.602.2 +webarena_verified.3765.604.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,604,test,webarena_verified.3765.603.2 +webarena_verified.5.605.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,605,train,webarena_verified.3765.604.2 +webarena_verified.5.606.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,606,train,webarena_verified.5.605.2 +webarena_verified.5.607.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,607,test,webarena_verified.5.606.2 +webarena_verified.5.608.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,608,test,webarena_verified.5.607.2 +webarena_verified.5.609.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,609,train,webarena_verified.5.608.2 +webarena_verified.9.610.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,610,train,webarena_verified.5.609.2 +webarena_verified.9.611.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,611,train,webarena_verified.9.610.2 +webarena_verified.9.612.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,612,test,webarena_verified.9.611.2 +webarena_verified.9.613.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,613,train,webarena_verified.9.612.2 +webarena_verified.9.614.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,614,test,webarena_verified.9.613.2 +webarena_verified.11.615.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,615,test,webarena_verified.9.614.2 +webarena_verified.11.616.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,616,test,webarena_verified.11.615.2 +webarena_verified.11.617.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,617,train,webarena_verified.11.616.2 
+webarena_verified.11.618.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,618,train,webarena_verified.11.617.2 +webarena_verified.11.619.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,619,train,webarena_verified.11.618.2 +webarena_verified.12.620.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,620,train,webarena_verified.11.619.2 +webarena_verified.12.621.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,621,train,webarena_verified.12.620.2 +webarena_verified.12.622.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,622,train,webarena_verified.12.621.2 +webarena_verified.12.623.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,623,test,webarena_verified.12.622.2 +webarena_verified.12.624.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,624,test,webarena_verified.12.623.2 +webarena_verified.13.625.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,625,train,webarena_verified.12.624.2 +webarena_verified.13.626.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,626,train,webarena_verified.13.625.2 +webarena_verified.13.627.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,627,train,webarena_verified.13.626.2 +webarena_verified.13.628.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,628,test,webarena_verified.13.627.2 +webarena_verified.13.629.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,629,test,webarena_verified.13.628.2 +webarena_verified.15.630.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,630,test,webarena_verified.13.629.2 +webarena_verified.15.631.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,631,train,webarena_verified.15.630.2 +webarena_verified.15.632.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,632,train,webarena_verified.15.631.2 +webarena_verified.15.633.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,633,test,webarena_verified.15.632.2 
+webarena_verified.15.634.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,634,train,webarena_verified.15.633.2 +webarena_verified.6100.635.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,635,train,webarena_verified.15.634.2 +webarena_verified.6100.636.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,636,train,webarena_verified.6100.635.2 +webarena_verified.6100.637.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,637,train,webarena_verified.6100.636.2 +webarena_verified.6100.638.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,638,test,webarena_verified.6100.637.2 +webarena_verified.6100.639.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,639,test,webarena_verified.6100.638.2 +webarena_verified.16.640.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,640,train,webarena_verified.6100.639.2 +webarena_verified.16.641.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,641,test,webarena_verified.16.640.2 +webarena_verified.16.642.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,642,test,webarena_verified.16.641.2 +webarena_verified.16.643.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,643,train,webarena_verified.16.642.2 +webarena_verified.16.644.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,644,train,webarena_verified.16.643.2 +webarena_verified.19.645.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,645,train,webarena_verified.16.644.2 +webarena_verified.19.646.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,646,train,webarena_verified.19.645.2 +webarena_verified.19.647.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,647,train,webarena_verified.19.646.2 +webarena_verified.19.648.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,648,test,webarena_verified.19.647.2 +webarena_verified.19.649.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,649,test,webarena_verified.19.648.2 
+webarena_verified.23.650.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,650,train,webarena_verified.19.649.2 +webarena_verified.23.651.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,651,train,webarena_verified.23.650.2 +webarena_verified.23.652.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,652,train,webarena_verified.23.651.2 +webarena_verified.153.653.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,653,train,webarena_verified.194.589.2 +webarena_verified.153.654.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,654,test,webarena_verified.153.653.2 +webarena_verified.153.655.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,655,test,webarena_verified.153.654.2 +webarena_verified.153.656.3,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,656,train,webarena_verified.153.655.2 +webarena_verified.153.657.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,657,train,webarena_verified.153.656.3 +webarena_verified.327.658.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,658,train,webarena_verified.339.594.2 +webarena_verified.327.659.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,659,test,webarena_verified.327.658.2 +webarena_verified.327.660.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,660,test,webarena_verified.327.659.2 +webarena_verified.328.661.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,661,test,webarena_verified.327.660.2 +webarena_verified.328.662.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,662,train,webarena_verified.328.661.2 +webarena_verified.328.663.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,663,train,webarena_verified.328.662.2 +webarena_verified.328.664.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,664,test,webarena_verified.328.663.2 +webarena_verified.328.665.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator,665,train,webarena_verified.328.664.2 +webarena_verified.335.666.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,666,test,webarena_verified.328.665.2 +webarena_verified.335.667.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,667,test,webarena_verified.335.666.2 +webarena_verified.335.668.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,668,test,webarena_verified.335.667.2 +webarena_verified.337.669.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,669,test,webarena_verified.335.668.2 +webarena_verified.337.670.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,670,train,webarena_verified.337.669.2 +webarena_verified.101.671.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,671,train,webarena_verified.23.652.2 +webarena_verified.101.672.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,672,train,webarena_verified.101.671.2 +webarena_verified.101.673.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,673,test,webarena_verified.101.672.2 +webarena_verified.101.674.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,674,test,webarena_verified.101.673.2 +webarena_verified.101.675.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,675,train,webarena_verified.101.674.2 +webarena_verified.253.676.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,676,test,webarena_verified.252.551.2 +webarena_verified.253.677.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,677,test,webarena_verified.253.676.2 +webarena_verified.253.678.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,678,train,webarena_verified.253.677.2 +webarena_verified.253.679.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,679,train,webarena_verified.253.678.2 +webarena_verified.253.680.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,680,train,webarena_verified.253.679.2 +webarena_verified.116.681.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,681,train,webarena_verified.337.670.2 +webarena_verified.116.682.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,682,train,webarena_verified.116.681.2 +webarena_verified.116.683.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,683,test,webarena_verified.116.682.2 +webarena_verified.117.684.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,684,train,webarena_verified.116.683.2 +webarena_verified.117.685.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,685,train,webarena_verified.117.684.2 +webarena_verified.117.686.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,686,train,webarena_verified.117.685.2 +webarena_verified.117.687.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,687,test,webarena_verified.117.686.2 +webarena_verified.117.688.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,688,test,webarena_verified.117.687.2 +webarena_verified.163.689.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,689,test,webarena_verified.101.675.2 +webarena_verified.163.690.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,690,test,webarena_verified.163.689.2 +webarena_verified.163.691.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,691,train,webarena_verified.163.690.2 +webarena_verified.163.692.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,692,train,webarena_verified.163.691.2 +webarena_verified.163.693.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,693,train,webarena_verified.163.692.2 +webarena_verified.256.694.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,694,train,webarena_verified.253.680.2 
+webarena_verified.256.695.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,695,train,webarena_verified.256.694.2 +webarena_verified.256.696.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,696,test,webarena_verified.256.695.2 +webarena_verified.256.697.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,697,train,webarena_verified.256.696.2 +webarena_verified.256.698.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,698,test,webarena_verified.256.697.2 +webarena_verified.258.699.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,699,train,webarena_verified.256.698.2 +webarena_verified.258.700.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,700,test,webarena_verified.258.699.2 +webarena_verified.258.701.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,701,test,webarena_verified.258.700.2 +webarena_verified.258.702.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,702,train,webarena_verified.258.701.2 +webarena_verified.258.703.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,703,train,webarena_verified.258.702.2 +webarena_verified.268.704.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,704,test,webarena_verified.258.703.2 +webarena_verified.268.705.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,705,test,webarena_verified.268.704.2 +webarena_verified.268.706.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,706,train,webarena_verified.268.705.2 +webarena_verified.268.707.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,707,train,webarena_verified.268.706.2 +webarena_verified.268.708.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,708,train,webarena_verified.268.707.2 +webarena_verified.271.709.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,709,test,webarena_verified.268.708.2 
+webarena_verified.271.710.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,710,test,webarena_verified.271.709.2 +webarena_verified.271.711.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,711,train,webarena_verified.271.710.2 +webarena_verified.271.712.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,712,train,webarena_verified.271.711.2 +webarena_verified.271.713.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,713,train,webarena_verified.271.712.2 +webarena_verified.24.714.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,714,train,webarena_verified.117.688.2 +webarena_verified.24.715.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,715,train,webarena_verified.24.714.2 +webarena_verified.24.716.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,716,train,webarena_verified.24.715.2 +webarena_verified.24.717.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,717,test,webarena_verified.24.716.2 +webarena_verified.24.718.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,718,test,webarena_verified.24.717.2 +webarena_verified.25.719.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,719,train,webarena_verified.24.718.2 +webarena_verified.25.720.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,720,test,webarena_verified.25.719.2 +webarena_verified.25.721.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,721,train,webarena_verified.25.720.2 
+webarena_verified.25.722.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,722,train,webarena_verified.25.721.2 +webarena_verified.25.723.2,False,reddit,AgentResponseEvaluator,723,test,webarena_verified.25.722.2 +webarena_verified.25.724.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,724,test,webarena_verified.25.723.2 +webarena_verified.1510.725.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,725,test,webarena_verified.25.724.2 +webarena_verified.1510.726.2,False,reddit,AgentResponseEvaluator,726,test,webarena_verified.1510.725.2 +webarena_verified.1510.727.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,727,train,webarena_verified.1510.726.2 +webarena_verified.1510.728.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,728,train,webarena_verified.1510.727.2 +webarena_verified.1510.729.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,729,train,webarena_verified.1510.728.2 +webarena_verified.1510.730.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,730,test,webarena_verified.1510.729.2 +webarena_verified.27.731.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,731,test,webarena_verified.1510.730.2 +webarena_verified.27.732.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,732,train,webarena_verified.27.731.2 +webarena_verified.27.733.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,733,train,webarena_verified.27.732.2 +webarena_verified.27.734.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,734,train,webarena_verified.27.733.2 
+webarena_verified.27.735.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,735,test,webarena_verified.27.734.2 +webarena_verified.355.736.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,736,train,webarena_verified.117.688.2 +webarena_verified.94.737.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,737,train,webarena_verified.371.430.2 +webarena_verified.94.738.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,738,test,webarena_verified.94.737.2 +webarena_verified.94.739.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,739,train,webarena_verified.94.738.2 +webarena_verified.94.740.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,740,test,webarena_verified.94.739.2 +webarena_verified.94.741.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,741,train,webarena_verified.94.740.2 +webarena_verified.332.742.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,742,test,webarena_verified.355.736.2 +webarena_verified.332.743.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,743,test,webarena_verified.332.742.2 +webarena_verified.332.744.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,744,test,webarena_verified.332.743.2 +webarena_verified.332.745.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,745,test,webarena_verified.332.744.2 +webarena_verified.332.746.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,746,train,webarena_verified.332.745.2 +webarena_verified.2100.747.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,747,train,webarena_verified.332.746.2 +webarena_verified.2100.748.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator NetworkEventEvaluator,748,train,webarena_verified.2100.747.2 +webarena_verified.2100.749.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,749,test,webarena_verified.2100.748.2 +webarena_verified.2100.750.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,750,test,webarena_verified.2100.749.2 +webarena_verified.2100.751.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,751,train,webarena_verified.2100.750.2 +webarena_verified.332.752.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,752,train,webarena_verified.2100.751.2 +webarena_verified.332.753.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,753,test,webarena_verified.332.752.2 +webarena_verified.332.754.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,754,train,webarena_verified.332.753.2 +webarena_verified.332.755.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,755,test,webarena_verified.332.754.2 +webarena_verified.332.756.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,756,train,webarena_verified.332.755.2 +webarena_verified.42.757.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,757,test,webarena_verified.94.741.2 +webarena_verified.42.758.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,758,test,webarena_verified.42.757.2 +webarena_verified.42.759.2,False,map shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,759,test,webarena_verified.271.713.2 +webarena_verified.42.760.2,False,map shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,760,test,webarena_verified.42.759.2 +webarena_verified.54.761.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,761,train,webarena_verified.42.760.2 +webarena_verified.54.762.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,762,train,webarena_verified.54.761.2 +webarena_verified.75.763.2,False,map,AgentResponseEvaluator 
NetworkEventEvaluator,763,test,webarena_verified.54.762.2 +webarena_verified.75.764.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,764,test,webarena_verified.75.763.2 +webarena_verified.75.765.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,765,train,webarena_verified.75.764.2 +webarena_verified.75.766.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,766,train,webarena_verified.75.765.2 +webarena_verified.75.767.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,767,train,webarena_verified.75.766.2 +webarena_verified.241.768.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,768,test,webarena_verified.42.760.2 +webarena_verified.241.769.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,769,test,webarena_verified.241.768.2 +webarena_verified.241.770.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,770,train,webarena_verified.241.769.2 +webarena_verified.243.771.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,771,test,webarena_verified.241.770.2 +webarena_verified.246.772.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,772,test,webarena_verified.243.771.2 +webarena_verified.246.773.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,773,train,webarena_verified.246.772.2 +webarena_verified.246.774.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,774,train,webarena_verified.246.773.2 +webarena_verified.246.775.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,775,train,webarena_verified.246.774.2 +webarena_verified.246.776.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,776,test,webarena_verified.246.775.2 
+webarena_verified.742.777.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,777,train,webarena_verified.246.776.2 +webarena_verified.742.778.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,778,test,webarena_verified.742.777.2 +webarena_verified.742.779.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,779,train,webarena_verified.742.778.2 +webarena_verified.742.780.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,780,test,webarena_verified.742.779.2 +webarena_verified.742.781.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,781,train,webarena_verified.742.780.2 +webarena_verified.742.782.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,782,test,webarena_verified.742.781.2 +webarena_verified.351.783.2,False,gitlab,AgentResponseEvaluator,783,train,webarena_verified.332.756.2 +webarena_verified.316.784.2,False,gitlab,AgentResponseEvaluator,784,test,webarena_verified.351.783.2 +webarena_verified.316.785.2,False,gitlab,AgentResponseEvaluator,785,test,webarena_verified.316.784.2 +webarena_verified.316.786.2,False,gitlab,AgentResponseEvaluator,786,test,webarena_verified.316.785.2 +webarena_verified.316.787.2,False,gitlab,AgentResponseEvaluator,787,test,webarena_verified.316.786.2 +webarena_verified.316.788.4,False,gitlab,AgentResponseEvaluator,788,test,webarena_verified.316.787.2 +webarena_verified.328.789.2,False,gitlab,AgentResponseEvaluator,789,test,webarena_verified.316.788.4 +webarena_verified.246.790.2,False,shopping_admin,AgentResponseEvaluator,790,test,webarena_verified.742.782.2 +webarena_verified.84.791.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,791,train,webarena_verified.27.735.2 
+webarena_verified.172.792.2,False,shopping,AgentResponseEvaluator,792,test,webarena_verified.163.693.2 +webarena_verified.172.793.2,False,shopping,AgentResponseEvaluator,793,train,webarena_verified.172.792.2 +webarena_verified.191.794.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,794,test,webarena_verified.172.793.2 +webarena_verified.191.795.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,795,train,webarena_verified.191.794.2 +webarena_verified.191.796.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,796,train,webarena_verified.191.795.2 +webarena_verified.191.797.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,797,test,webarena_verified.191.796.2 +webarena_verified.191.798.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,798,train,webarena_verified.191.797.2 +webarena_verified.600.799.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,799,train,webarena_verified.84.791.2 +webarena_verified.600.800.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,800,test,webarena_verified.600.799.2 +webarena_verified.600.801.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,801,train,webarena_verified.600.800.2 +webarena_verified.600.802.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,802,train,webarena_verified.600.801.2 +webarena_verified.600.803.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,803,test,webarena_verified.600.802.2 +webarena_verified.999.804.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,804,train,webarena_verified.600.803.2 +webarena_verified.335.805.2,False,gitlab,AgentResponseEvaluator,805,test,webarena_verified.999.804.2 +webarena_verified.335.806.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,806,test,webarena_verified.335.805.2 
+webarena_verified.335.807.2,False,gitlab,AgentResponseEvaluator,807,train,webarena_verified.335.806.2 +webarena_verified.327.808.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,808,train,webarena_verified.335.807.2 +webarena_verified.327.809.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,809,train,webarena_verified.327.808.2 +webarena_verified.999.810.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,810,test,webarena_verified.327.809.2 +webarena_verified.999.811.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,811,test,webarena_verified.999.810.2 diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index 7a9948e3..a5b5d2ee 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -157,16 +157,16 @@ def prepare_backend(backend: str): ) massage_tasks( [ - f"webarena_verified.{intent_template_id}.{task_id}" - for intent_template_id, task_id in [ - (23, 410), # reddit - (330, 533), # gitlab - (87, 561), # gitlab wiki - (88, 562), # gitlab reddit - (165, 574), # shopping - (16, 640), # reddit - (253, 680), # shopping_admin - (94, 740), # wiki map + f"webarena_verified.{intent_template_id}.{task_id}.{revision}" + for intent_template_id, task_id, revision in [ + (23, 410, 2), # reddit + (330, 533, 2), # gitlab + (87, 561, 3), # gitlab wiki + (88, 562, 2), # gitlab reddit + (165, 574, 2), # shopping + (16, 640, 2), # reddit + (253, 680, 2), # shopping_admin + (94, 740, 2), # wiki map ] ] ) diff --git a/browsergym/pyproject.toml b/browsergym/pyproject.toml index 9489bfea..1988e430 100644 --- a/browsergym/pyproject.toml +++ b/browsergym/pyproject.toml @@ -16,6 +16,7 @@ authors = [ {name = "Thibault Le Sellier De Chezelles"}, {name = "Tom Marty"}, {name = "Aman Jaiswal"}, + {name = "Nicolas Gontier"}, ] readme = "README.md" 
requires-python = ">3.10" diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 279505b7..44fe4d5e 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -49,4 +49,10 @@ benchmark = benchmark.subset_from_list( ) ``` -**NOTE**: Tasks are registered with this template: `webarena_verified.{intent_template_id}.{task_id}` +#### 3. Task gym ID format + +Tasks are registered to gym with this template: `webarena_verified.{intent_template_id}.{task_id}.{revision}` + +- the `intent_template_id` (int) refers to the template of the question. Multiple tasks can have the same template question but with different instantiations. +- the `task_id` (int from 0 to 811) is unique to each question. This is the same task ID as in the original webarena benchmark. +- the `revision` (int) is a version number to keep track of updates done to all webarena-verified tasks across time. diff --git a/browsergym/webarena_verified/pyproject.toml b/browsergym/webarena_verified/pyproject.toml index da593c2a..166eb1d3 100644 --- a/browsergym/webarena_verified/pyproject.toml +++ b/browsergym/webarena_verified/pyproject.toml @@ -29,11 +29,6 @@ path = "../core/src/browsergym/core/__init__.py" [tool.hatch.metadata.hooks.requirements_txt] files = ["requirements.txt"] -[tool.hatch.build] -include = [ - "src/browsergym/webarena_verified/webarena_verified.json" -] - [tool.hatch.metadata] allow-direct-references = true diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py index 030d4830..bc73cc73 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py @@ -14,8 +14,10 @@ ALL_WEBARENA_TASK_IDS = [] # register all WebArena benchmark -for task_id, intent_template_id in zip(config.TASK_IDS,
config.INTENT_TEMPLATE_IDS): - gym_id = f"webarena_verified.{intent_template_id}.{task_id}" +for task_id, intent_template_id, revision in zip( + config.TASK_IDS, config.INTENT_TEMPLATE_IDS, config.REVISIONS +): + gym_id = f"webarena_verified.{intent_template_id}.{task_id}.{revision}" register_task( gym_id, task.WebArenaVerifiedTask, diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py index c412c7df..0ff7c883 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/config.py @@ -5,34 +5,19 @@ TASK_IDS = range(812) INTENT_TEMPLATE_IDS = [] +REVISIONS = [] -with open(Path(__file__).parent / "webarena_verified.json", "r") as f: - data = json.load(f) - -# Check if the json file is the same as the one in the webarena-verified repository -library_json_string = ( +# Load the json file from the webarena-verified library +data = json.loads( importlib.resources.files("webarena_verified") .joinpath("assets/dataset/webarena-verified.json") .read_text() ) -library_json = json.loads(library_json_string) - -if json.dumps(data, sort_keys=True, indent=2) != json.dumps(library_json, sort_keys=True, indent=2): - print( - "Warning: the json file is not the same as the one in the webarena-verified repository. Consider updating the library." 
- ) - print("=" * 100) - print("Differences:") - for diff in difflib.unified_diff( - json.dumps(data, sort_keys=True, indent=2).splitlines(), - json.dumps(library_json, sort_keys=True, indent=2).splitlines(), - ): - print(diff) - print("=" * 100) for task in data: INTENT_TEMPLATE_IDS.append(task["intent_template_id"]) + REVISIONS.append(task["revision"]) -assert len(INTENT_TEMPLATE_IDS) == len( - TASK_IDS -), "Number of intent template IDs must match number of task IDs" +assert ( + len(INTENT_TEMPLATE_IDS) == len(TASK_IDS) == len(REVISIONS) +), f"Number of intent template IDs ({len(INTENT_TEMPLATE_IDS)}), task IDs ({len(TASK_IDS)}), and revisions ({len(REVISIONS)}) must match" diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py index da436ea7..bd30bcb3 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py @@ -3,6 +3,7 @@ from platform-labs-agent-eval-harness. 
""" +import importlib.resources import json import logging import tempfile @@ -46,7 +47,9 @@ def __init__(self, webarena_instance: WebArenaInstance): """ # Create configuration for all sites and homepage from webarena_instance config = WebArenaVerifiedConfig( - test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"), + test_data_file=importlib.resources.files("webarena_verified").joinpath( + "assets/dataset/webarena-verified.json" + ), environments={ **{ site: EnvironmentConfig( diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py index 986a6231..f887b5b7 100644 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py +++ b/browsergym/webarena_verified/src/browsergym/webarena_verified/task.py @@ -46,8 +46,8 @@ def __init__( # Load the webarena_verified.json file all_configs_str = ( - importlib.resources.files("browsergym.webarena_verified") - .joinpath("webarena_verified.json") + importlib.resources.files("webarena_verified") + .joinpath("assets/dataset/webarena-verified.json") .read_text() ) diff --git a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json b/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json deleted file mode 100644 index eb73812e..00000000 --- a/browsergym/webarena_verified/src/browsergym/webarena_verified/webarena_verified.json +++ /dev/null @@ -1,23962 +0,0 @@ -[ - { - "sites": ["shopping_admin"], - "task_id": 0, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-1 best-selling product name(s) in 2022", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 1, "entity": "product name(s)", "period": "2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - 
"task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Quest Lumaflex\u2122 Band"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 1, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-1 best-selling brand name(s) in Quarter 1 2022", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 1, "period": "Quarter 1 2022", "entity": "brand name(s)"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Sprite"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 2, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-1 best-selling product type name(s) in Quarter 1 2022", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 1, "period": "Quarter 1 2022", "entity": "product type name(s)"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ ["Digital Watch", "Band", "Stasis Ball", "Yoga Strap"] ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 3, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-2 best-selling product name(s) in 2022", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 2, "entity": "product name(s)", "period": "2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Quest 
Lumaflex\u2122 Band", - [ - "Sprite Stasis Ball 65 cm", "Cruise Stasis Ball 65 cm", - "Sprite Stasis Ball 55 cm" - ] - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 4, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-3 best-selling product name(s) in Jan 2023", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 3, "period": "Jan 2023", "entity": "product name(s)"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Impulse Duffle", "Overnight Duffle", "Hawkeye Yoga Short-32-Blue"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 5, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-1 best-selling product type name(s) in Jan 2023", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 1, "period": "Jan 2023", "entity": "product type name(s)"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Duffle"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 6, - "intent_template_id": 279, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top-2 best-selling product name(s) in 2023", - "intent_template": "Get the top-{{n}} best-selling {{entity}} in {{period}}", - "instantiation_dict": {"n": 2, "entity": "product name(s)", "period": "2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - 
"retrieved_data": [ - "Sprite Yoga Strap 6 foot", - ["Overnight Duffle", "Ida Workout Parachute Pant-29-Purple"] - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 7, - "intent_template_id": 79, - "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 50 km to Carnegie Mellon University. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "airport_type": "international airports", - "start": "Carnegie Mellon University", - "radius": "50 km", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode": { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Pittsburgh International Airport", - "state": "Pennsylvania", - "postcode": "15231" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 8, - "intent_template_id": 79, - "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 5 km to Carnegie Mellon University. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. 
{{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "airport_type": "international airports", - "start": "Carnegie Mellon University", - "radius": "5 km", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 9, - "intent_template_id": 79, - "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all international airports that are within a driving distance of 30 km to Carnegie Art Museum. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "airport_type": "international airports", - "start": "Carnegie Art Museum", - "radius": "30 km", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode": { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Pittsburgh International Airport", - "state": "Pennsylvania", - "postcode": "15231" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 10, - "intent_template_id": 79, - "start_urls": ["__MAP__"], - "intent": "Get the name, state, and zip code of all US international airports that are within a driving distance of 60 km to Niagara Falls. Return a list of objects with keys \"name\", \"state\", and \"postcode\". (Use the OSRM direction service.)", - "intent_template": "Get the name, state, and zip code of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "airport_type": "US international airports", - "start": "Niagara Falls", - "radius": "60 km", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode": { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Niagara Falls International Airport", - "state": "New York", - "postcode": "14304" - }, - { - "name": "Buffalo Niagara International Airport", - "state": "New York", - "postcode": "14225" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 11, - "intent_template_id": 288, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of reviews that our store received so far that mention term \"disappointed\"", - "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", - "instantiation_dict": {"term": "disappointed"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [6] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 12, - "intent_template_id": 288, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of reviews that our store received so far that mention term \"satisfied\"", - "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", - "instantiation_dict": {"term": "satisfied"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - 
"results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 13, - "intent_template_id": 288, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of reviews that our store received so far that mention term \"decent\"", - "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", - "instantiation_dict": {"term": "decent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 14, - "intent_template_id": 288, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of reviews that our store received so far that mention term \"not useful\"", - "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", - "instantiation_dict": {"term": "not useful"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 15, - "intent_template_id": 288, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of reviews that our store received so far that mention term \"best\"", - "intent_template": "Get the total number of reviews that our store received so far that mention term \"{{term}}\"", - "instantiation_dict": {"term": "best"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": 
"retrieve", "status": "SUCCESS", "retrieved_data": [2] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 16, - "intent_template_id": 73, - "start_urls": ["__MAP__"], - "intent": "Get the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "start": "5000 Fifth Avenue, Pittsburgh", - "end": "UPMC family health center", - "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "mode" : { "type": "string" }, - "duration": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "mode": "driving", "duration": "2min" }, - { "mode": "walking", "duration": "16min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 17, - "intent_template_id": 73, - "start_urls": ["__MAP__"], - "intent": "Get the time for walking and driving route from AMC Waterfront to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "start": "AMC Waterfront", - "end": "Carnegie Mellon University", - "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "mode" : { "type": "string" }, - "duration": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "mode": "driving", "duration": "13min" }, - { "mode": "walking", "duration": "1hr 35min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 18, - "intent_template_id": 73, - "start_urls": ["__MAP__"], - "intent": "Get the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "start": "AMC Waterfront", - "end": "Univ of Pittsburgh", - "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "mode" : { "type": "string" }, - "duration": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "mode": "driving", "duration": "2min" }, - { "mode": "walking", "duration": "16min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 19, - "intent_template_id": 73, - "start_urls": ["__MAP__"], - "intent": "Get the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Carnegie Science Center", - "end": "Carnegie Mellon University", - "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "mode" : { "type": "string" }, - "duration": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "mode": "driving", "duration": "12min" }, - { "mode": "walking", "duration": "1hr 44min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 20, - "intent_template_id": 73, - "start_urls": ["__MAP__"], - "intent": "Get the time for walking and driving route from Randyland to Carnegie Mellon University. Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the time for walking and driving route from {{start}} to {{end}}. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Randyland", - "end": "Carnegie Mellon University", - "retrieved_data_format_spec": "Return a list of objects with keys \"mode\" (driving or walking) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "mode" : { "type": "string" }, - "duration": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "mode": "driving", "duration": "13min" }, - { "mode": "walking", "duration": "1hr 45min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 21, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html" - ], - "intent": "Get name(s) of reviewer(s) who mention ear cups being small for the product on the current page", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "ear cups being small"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Catso", - "Dibbins", - ["Anglebert Dinkherhump", "Anglebert", "Dinkherhump"], - ["Michelle Davis", "Michelle DavisMichelle Davis"] - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 22, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html" - ], - "intent": "Get name(s) of 
reviewer(s) who mention under water photo for the product on the current page", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "under water photo"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 23, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html" - ], - "intent": "Get name(s) of reviewer(s) who mention good fingerprint resistant for the product on the current page", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "good fingerprint resistant"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Rachel", "T. 
Gannon"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 24, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html" - ], - "intent": "Get name(s) of reviewer(s) who mention price being unfair for the product on the current page", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "price being unfair"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 25, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" - ], - "intent": "Get name(s) of reviewer(s) who mention print quality explicitly with a rating of 3 or less stars for the product on the current page", - "intent_template": "Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "print quality explicitly with a rating of 3 or less stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Roxanne Brandon Coffey", "Nelson"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 26, - "intent_template_id": 222, - "start_urls": [ - "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html" - ], - "intent": "Get name(s) of reviewer(s) who mention complain of the customer service for the product on the current page", - "intent_template": 
"Get name(s) of reviewer(s) who mention {{description}} for the product on the current page", - "instantiation_dict": {"description": "complain of the customer service"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["RemyRRemyR", "Bob in Vegas"] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 27, - "intent_template_id": 33, - "start_urls": ["__REDDIT__"], - "intent": "In the personal finances forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "forum": "personal finances", - "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "username" : { "type": "string" }, - "post_title": { "type": "string" }, - "count" : { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "username": "Hammer94", - "post_title": "56 year old mom has no retirement. 
Where do I even start on her behalf?", - "count": 0 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 28, - "intent_template_id": 33, - "start_urls": ["__REDDIT__"], - "intent": "In the Worcester forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "forum": "Worcester", - "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "username" : { "type": "string" }, - "post_title": { "type": "string" }, - "count" : { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "username": "mineinhusdson", - "post_title": "Best place for a foot rub?", - "count": 0 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 29, - "intent_template_id": 33, - "start_urls": ["__REDDIT__"], - "intent": "In the DIY forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
Return a list of objects with keys \"username\", \"post_title\", and \"count\".", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "forum": "DIY", - "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "username" : { "type": "string" }, - "post_title": { "type": "string" }, - "count" : { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "username": "ziostraccette", - "post_title": "How can I bring an HDMI cable from my pc downstairs to my TV upstairs?", - "count": 0 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 30, - "intent_template_id": 33, - "start_urls": ["__REDDIT__"], - "intent": "In the space forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "forum": "space", - "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "username" : { "type": "string" }, - "post_title": { "type": "string" }, - "count" : { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "username": "Dhghomon", - "post_title": "Scientists erupt at NASA gutting funding for crucial Venus mission", - "count": 0 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 31, - "intent_template_id": 33, - "start_urls": ["__REDDIT__"], - "intent": "In the photoshopbattles forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. Return a list of objects with keys \"username\", \"post_title\", and \"count\".", - "intent_template": "In the {{forum}} forum, get the username and post title of the most recent post, and count the number of comments on that post that are not from the author and have more downvotes than upvotes. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "forum": "photoshopbattles", - "retrieved_data_format_spec": "Return a list of objects with keys \"username\", \"post_title\", and \"count\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "username" : { "type": "string" }, - "post_title": { "type": "string" }, - "count" : { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "username": "Proud_Idiot", - "post_title": "UK Prime Minister Rishi Sunak looking at a pothole", - "count": 0 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 32, - "intent_template_id": 78, - "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Find a Hilton hotel in the vicinity, if available, and get me its name and the walking distance to the nearest supermarket own by a local company from the hotel. Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place": "Pittsburgh Airport", - "information": "walking distance", - "target1": "Hilton hotel", - "target2": "the nearest supermarket own by a local company", - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel" : { "type": "string", "format": "location-name" }, - "distance": { "type": "string", "format": "distance" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "2km"} - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 33, - "intent_template_id": 78, - "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Find a Hilton hotel in the vicinity, if available, and get me its name and the shortest walking distance to a supermarket from the hotel. Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place": "Pittsburgh Airport", - "target1": "Hilton hotel", - "information": "shortest walking distance", - "target2": "a supermarket", - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel" : { "type": "string", "format": "location-name" }, - "distance": { "type": "string", "format": "distance" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - {"hotel": "DoubleTree by Hilton Hotel Pittsburgh Airport", "distance": "1.4km"} - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 34, - "intent_template_id": 78, - "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Find a Hyatt hotel in the vicinity, if available, and get me its name and the shortest walking time to a supermarket from the hotel. Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place": "Pittsburgh Airport", - "target1": "Hyatt hotel", - "information": "shortest walking time", - "target2": "a supermarket", - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel" : { "type": "string", "format": "location-name" }, - "information": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "hotel_name": "Hyatt Regency Pittsburgh International Airport", - "travel_time": "3h 30min" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 35, - "intent_template_id": 78, - "start_urls": ["__MAP__"], - "intent": "I will arrive at Pittsburgh Airport soon. Find a Hyatt hotel in the vicinity, if available, and get me its name and the minimal driving time to a supermarket from the hotel. Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "I will arrive at {{place}} soon. Find a {{target1}} in the vicinity, if available, and get me its name and the {{information}} to {{target2}} from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place": "Pittsburgh Airport", - "target1": "Hyatt hotel", - "information": "minimal driving time", - "target2": "a supermarket", - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel_name\" (hotel name only) and \"travel_time\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel" : { "type": "string", "format": "location-name" }, - "information": { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "hotel_name": "Hyatt Regency Pittsburgh International Airport", - "travel_time": "15min" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 36, - "intent_template_id": 77, - "start_urls": ["__MAP__"], - "intent": "Determine whether the social security administration in Pittsburgh can be reached within one hour by car from Carnegie Mellon University. Return true if it can, otherwise false. (Use the OSRM direction service.)", - "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", - "instantiation_dict": { - "place": "social security administration", - "location": "Carnegie Mellon University", - "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 37, - "intent_template_id": 77, - "start_urls": ["__MAP__"], - "intent": "Determine whether the police station in Pittsburgh can be reached within one hour by car from gates building at CMU. Return true if it can, otherwise false. (Use the OSRM direction service.)", - "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", - "instantiation_dict": { - "place": "police station", - "location": "gates building at CMU", - "retrieved_data_format_spec": ". Return true if it can, otherwise false." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 38, - "intent_template_id": 77, - "start_urls": ["__MAP__"], - "intent": "Determine whether the duquesne university in Pittsburgh can be reached within one hour by car from pittsburgh airport. Return true if it can, otherwise false. (Use the OSRM direction service.)", - "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", - "instantiation_dict": { - "place": "duquesne university", - "location": "pittsburgh airport", - "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 39, - "intent_template_id": 77, - "start_urls": ["__MAP__"], - "intent": "Determine whether the walmart in Pittsburgh can be reached within one hour by car from 5600 fifth avenue. Return true if it can, otherwise false. (Use the OSRM direction service.)", - "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", - "instantiation_dict": { - "place": "walmart", - "location": "5600 fifth avenue", - "retrieved_data_format_spec": ". Return true if it can, otherwise false." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 40, - "intent_template_id": 77, - "start_urls": ["__MAP__"], - "intent": "Determine whether the amc theatre in Pittsburgh can be reached within one hour by car from hobart street. Return true if it can, otherwise false. (Use the OSRM direction service.)", - "intent_template": "Determine whether the {{place}} in Pittsburgh can be reached within one hour by car from {{location}}{{retrieved_data_format_spec}} (Use the OSRM direction service.)", - "instantiation_dict": { - "place": "amc theatre", - "location": "hobart street", - "retrieved_data_format_spec": ". Return true if it can, otherwise false." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 41, - "intent_template_id": 285, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top 1 search term(s) in my store", - "intent_template": "Get the top {{n}} search term(s) in my store", - "instantiation_dict": {"n": "1"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["hollister"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 42, - "intent_template_id": 285, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top 2 search term(s) in my store", - "intent_template": "Get the top {{n}} search term(s) in my store", - "instantiation_dict": {"n": "2"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["hollister", "Joust Bag"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 43, - "intent_template_id": 285, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top 3 search term(s) in my store", - "intent_template": "Get the top {{n}} search term(s) in my store", - "instantiation_dict": {"n": "3"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["hollister", "Joust Bag", "nike"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 44, - "intent_template_id": 303, - 
"start_urls": ["__GITLAB__"], - "intent": "Open my todos page", - "intent_template": "Open my todos page", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": ["__GITLAB__/dashboard/todos", "__GITLAB__/dashboard/todos?state=pending"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 45, - "intent_template_id": 300, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Open the issues page for the current project filtered to the most recent open issues", - "intent_template": "Open the issues page for the current project filtered to the most recent open issues", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__GITLAB__/a11yproject/a11yproject.com/-/issues"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 46, - "intent_template_id": 300, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Open the issues page for the current project filtered to the most recent open issues", - "intent_template": "Open the issues page for the current project filtered to the most recent open issues", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": [ - "__GITLAB__/primer/design/-/issues", - "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened" - ] - } - } - ], - 
"revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 47, - "intent_template_id": 197, - "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Get how many complete orders I have over the past months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", - "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "over the past months", - "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "order_count": { "type": "number" }, - "amount" : { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"order_count": 0, "amount": 0} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 48, - "intent_template_id": 197, - "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Get how many complete orders I have over the past three days, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", - "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "over the past three days", - "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "order_count": { "type": "number" }, - "amount" : { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"order_count": 0, "amount": 0} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 49, - "intent_template_id": 197, - "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Get how many complete orders I have over the past four months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", - "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "over the past four months", - "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "order_count": { "type": "number" }, - "amount" : { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"order_count": 3, "amount": 845.49} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 50, - "intent_template_id": 197, - "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Get how many complete orders I have over the past year, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", - "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "over the past year", - "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "order_count": { "type": "number" }, - "amount" : { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"order_count": 21, "amount": 6560.69} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 51, - "intent_template_id": 197, - "start_urls": ["__SHOPPING__"], - "intent": "Today is June 12, 2023. Get how many complete orders I have over the past six months, and the total amount of money I spent (including shipping and handling fees). Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details.", - "intent_template": "Today is June 12, 2023. Get how many complete orders I have {{period}}, and the total amount of money I spent (including shipping and handling fees). 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "over the past six months", - "retrieved_data_format_spec": "Return an object with keys \"order_count\" (as integer) and \"amount\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "order_count": { "type": "number" }, - "amount" : { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"order_count": 7, "amount": 1700.84} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 52, - "intent_template_id": 68, - "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Carnegie Mellon University", - "end": "starbucks on Craig Street", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["7min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 53, - "intent_template_id": 68, - "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street? Return the value as a string in HH:MM:SS format only, without any additional details. 
(Use the OSRM direction service.)", - "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Univ of Pittsburgh", - "end": "starbucks on Craig Street", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["18min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 54, - "intent_template_id": 68, - "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Carnegie Mellon University", - "end": "Univ of Pittsburgh", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["25min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 55, - "intent_template_id": 68, - "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from the Starbucks near CMU to Chatham university? Return the value as a string in HH:MM:SS format only, without any additional details. 
(Use the OSRM direction service.)", - "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "start": "the Starbucks near CMU", - "end": "Chatham university", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["30min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 56, - "intent_template_id": 68, - "start_urls": ["__MAP__"], - "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "How long does it take to walk from {{start}} to {{end}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "start": "Carnegie Museum of Art", - "end": "a library at CMU", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["11min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 57, - "intent_template_id": 69, - "start_urls": ["__MAP__"], - "intent": "Get the closest restaurant(s) to university center at Carnegie Mellon University", - "intent_template": "Get the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": { - "place1": "restaurant", - "place2": "university center at Carnegie Mellon University" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "El Gallo de Oro", "Back Bar Grill", "Grano", "Beefsteak", "Nourish", - "Schatz Dining Room", "Au Bon Pain" - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 58, - "intent_template_id": 69, - "start_urls": ["__MAP__"], - "intent": "Get the closest cafe(s) to CMU Hunt library", - "intent_template": "Get the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": {"place1": "cafe", "place2": "CMU Hunt library"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["De Fer Coffee & Tea"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 59, - "intent_template_id": 69, - "start_urls": ["__MAP__"], - 
"intent": "Get the closest restaurant(s) to CMU Hunt library", - "intent_template": "Get the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": {"place1": "restaurant", "place2": "CMU Hunt library"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["The exchange"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 60, - "intent_template_id": 69, - "start_urls": ["__MAP__"], - "intent": "Get the closest restaurant(s) to CMU Posner Hall", - "intent_template": "Get the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": {"place1": "restaurant", "place2": "CMU Posner Hall"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["The exchange"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 61, - "intent_template_id": 69, - "start_urls": ["__MAP__"], - "intent": "Get the closest restaurant(s) to CMU Sorrells Library", - "intent_template": "Get the closest {{place1}}(s) to {{place2}}", - "instantiation_dict": {"place1": "restaurant", "place2": "CMU Sorrells Library"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["La Prima Espresso"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 62, - "intent_template_id": 276, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer email(s) who completed the most number of orders in the entire history", - "intent_template": "Get customer email(s) who 
{{order_criteria}} in the entire history", - "instantiation_dict": {"order_criteria": "completed the most number of orders"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["janesmith456@yahoo.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 63, - "intent_template_id": 276, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer email(s) who completed the second most number of orders in the entire history", - "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", - "instantiation_dict": {"order_criteria": "completed the second most number of orders"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["helloworld@yahoo.com", "michael.nguyen@yahoo.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 64, - "intent_template_id": 276, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer email(s) who have 2 orders in any state in the entire history", - "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", - "instantiation_dict": {"order_criteria": "have 2 orders in any state"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["musiclover99@hotmail.com", "roni_cost@example.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 65, - "intent_template_id": 276, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get customer email(s) who completed the fifth most number of orders in the 
entire history", - "intent_template": "Get customer email(s) who {{order_criteria}} in the entire history", - "instantiation_dict": {"order_criteria": "completed the fifth most number of orders"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["adam.garcia@gmail.com", "jason.miller@yahoo.com", "jennifer.white@yahoo.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 66, - "intent_template_id": 17, - "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the post titles from those that recommend exactly one book.", - "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "number": 10, - "description": "the post titles", - "forum": "Books", - "filter_criterion": "recommend exactly one book", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "I just finished reading The Hobbit to my 6 year old daughter, and she loved it!", - "Apple Books has a free audiobook of A Christmas Carol narrated by LeVar Burton!" 
- ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 67, - "intent_template_id": 17, - "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the book titles, exactly as they appear in the post description, from those that recommend exactly one book.", - "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "number": 10, - "description": "the book titles, exactly as they appear in the post description,", - "forum": "Books", - "filter_criterion": "recommend exactly one book", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["The Hobbit", "A Christmas Carol"] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 68, - "intent_template_id": 17, - "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the author names and book titles, exactly as they appear in the post description, from those that recommend exactly one book. Return a list of objects with keys \"book\" and \"author\".", - "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "number": 10, - "description": "the author names and book titles, exactly as they appear in the post description,", - "forum": "Books", - "filter_criterion": "recommend exactly one book", - "retrieved_data_format_spec": " Return a list of objects with keys \"book\" and \"author\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "book": {"type": "string"}, "author": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "book": "The Hobbit" , "author": "Tolkien" }, - { "book": "A Christmas Carol", "author": null } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 69, - "intent_template_id": 17, - "start_urls": ["__REDDIT__"], - "intent": "Among the top 10 hottest posts in the \"Books\" forum, get the URLs of any organizations involved, exactly as they appear in the post description, from those that talk about supporting local book stores.", - "intent_template": "Among the top {{number}} hottest posts in the \"{{forum}}\" forum, get {{description}} from those that {{filter_criterion}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "number": 10, - "description": "the URLs of any organizations involved, exactly as they appear in the post description,", - "forum": "Books", - "filter_criterion": "talk about supporting local book stores", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ ["bookshop.org", "https://bookshop.org"] ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 70, - "intent_template_id": 70, - "start_urls": ["__MAP__"], - "intent": "What is the zip code of Carnegie Mellon University?", - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": {"place": "Carnegie Mellon University"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", 
"retrieved_data": ["15213"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 71, - "intent_template_id": 70, - "start_urls": ["__MAP__"], - "intent": "What is the zip code of Chatham University?", - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": {"place": "Chatham University"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["15232"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 72, - "intent_template_id": 70, - "start_urls": ["__MAP__"], - "intent": "What is the zip code of Yale University?", - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": {"place": "Yale University"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["06516"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 73, - "intent_template_id": 70, - "start_urls": ["__MAP__"], - "intent": "What is the zip code of Columbia University?", - "intent_template": "What is the zip code of {{place}}?", - "instantiation_dict": {"place": "Columbia University"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["10027"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 74, - "intent_template_id": 65, - "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Carnegie Mellon University\", \"apple store shadyside\", \"starbucks on craig street\", what would be the optimal route to travel through them all in order to minimize total travel time? 
Please note the journey begins at the first place listed. (Use the OSRM direction service.)", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", - "instantiation_dict": { - "place_list": [ - "Carnegie Mellon University", "apple store shadyside", - "starbucks on craig street" - ] - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Carnegie Mellon University", "starbucks on craig street", - "apple store shadyside" - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 75, - "intent_template_id": 65, - "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Massachusetts Institute of Technology\", \"Harvard University\", \"Boston Logan International Airport\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place_list": [ - "Massachusetts Institute of Technology", "Harvard University", - "Boston Logan International Airport" - ] - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Massachusetts Institute of Technology", "Harvard University", - "Boston Logan International Airport" - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 76, - "intent_template_id": 65, - "start_urls": ["__MAP__"], - "intent": "Given the following locations,\"Princeton University\", \"Yale University\", \"Harvard University\", what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. (Use the OSRM direction service.)", - "intent_template": "Given the following locations,{% for place in place_list %}\"{{ place }}\"{% if not loop.last %}, {% endif %}{% endfor %}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place_list": ["Princeton University", "Yale University", "Harvard University"] - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "ordered": true, - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Princeton University", "Yale University", "Harvard University"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 77, - "intent_template_id": 277, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Pending reviews amongst all the reviews", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews", - "instantiation_dict": {"status": "Pending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 78, - "intent_template_id": 277, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Approved reviews amongst all the reviews", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews", - "instantiation_dict": {"status": "Approved"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [346] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 79, - "intent_template_id": 277, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of Not Approved reviews amongst all the reviews", - "intent_template": "Get the total number of {{status}} reviews amongst all the reviews", - "instantiation_dict": 
{"status": "Not Approved"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 80, - "intent_template_id": 72, - "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "place_A": "Carnegie Mellon University", - "place_B": "Starbucks on Craig Street", - "place_C": "Pittsburgh International Airport", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["38min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 81, - "intent_template_id": 72, - "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "place_A": "Univ of Pittsburgh", - "place_B": "starbucks on Craig Street", - "place_C": "Pittsburgh International Airport", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["49min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 82, - "intent_template_id": 72, - "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "place_A": "Massachusetts Institute of Technology", - "place_B": "Harvard University", - "place_C": "Boston Logan International Airport", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["63min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 83, - "intent_template_id": 72, - "start_urls": ["__MAP__"], - "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street? 
Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "place_A": "Carnegie Mellon University", - "place_B": "apple store shadyside", - "place_C": "starbucks on craig street", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["22min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 84, - "intent_template_id": 64, - "start_urls": ["__MAP__"], - "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "hotel": "DoubleTree by Hilton New York Downtown", - "place": "Keens Steakhouse", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["14min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 85, - "intent_template_id": 64, - "start_urls": ["__MAP__"], - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "hotel": "La Quinta Inn near the airport", - "place": "Carnegie Mellon University", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["30min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 86, - "intent_template_id": 64, - "start_urls": ["__MAP__"], - "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt? Return the value as a string in HH:MM:SS format only, without any additional details. 
(Use the OSRM direction service.)", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "hotel": "La Quinta Inn near the airport", - "place": "Upitt", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["29min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 87, - "intent_template_id": 64, - "start_urls": ["__MAP__"], - "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "hotel": "red roof inn", - "place": "Pittsburgh science museum", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["20min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 88, - "intent_template_id": 64, - "start_urls": ["__MAP__"], - "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena? Return the value as a string in HH:MM:SS format only, without any additional details. 
(Use the OSRM direction service.)", - "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "hotel": "Homewood Suites Southpointe", - "place": "PPG Paints Arena", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["34min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 89, - "intent_template_id": 67, - "start_urls": ["__MAP__"], - "intent": "Get the relation IDs of each US state that borders Connecticut. Return a list of integers only, without any additional details.", - "intent_template": "Get the relation IDs of each US state that borders {{state}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "state": "Connecticut", - "retrieved_data_format_spec": "Return a list of integers only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [392915, 61315, 175905] - } - } - ], - "revision": 3 - }, - { - "sites": ["map"], - "task_id": 90, - "intent_template_id": 67, - "start_urls": ["__MAP__"], - "intent": "Get the relation IDs of each US state that borders Pennsylvania. Return a list of integers only, without any additional details.", - "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "state": "Pennsylvania", - "retrieved_data_format_spec": "Return a list of integers only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [162061, 162112, 175905, 224951, 162110, 162068] - } - } - ], - "revision": 3 - }, - { - "sites": ["map"], - "task_id": 91, - "intent_template_id": 67, - "start_urls": ["__MAP__"], - "intent": "Get the relation IDs of each US state that borders Massachusetts. Return a list of integers only, without any additional details.", - "intent_template": "Get the relation IDs of each US state that borders {{state}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "state": "Massachusetts", - "retrieved_data_format_spec": "Return a list of integers only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [392915, 165794, 175905, 67213, 60759] - } - } - ], - "revision": 3 - }, - { - "sites": ["map"], - "task_id": 92, - "intent_template_id": 67, - "start_urls": ["__MAP__"], - "intent": "Get the relation IDs of each US state that borders Vermont. Return a list of integers only, without any additional details.", - "intent_template": "Get the relation IDs of each US state that borders {{state}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "state": "Vermont", - "retrieved_data_format_spec": "Return a list of integers only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [175905, 67213, 61315] - } - } - ], - "revision": 3 - }, - { - "sites": ["map"], - "task_id": 93, - "intent_template_id": 67, - "start_urls": ["__MAP__"], - "intent": "Get the relation IDs of each US state that borders New Hampshire. Return a list of integers only, without any additional details.", - "intent_template": "Get the relation IDs of each US state that borders {{state}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "state": "New Hampshire", - "retrieved_data_format_spec": "Return a list of integers only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [61315, 60759, 63512] - } - } - ], - "revision": 3 - }, - { - "sites": ["shopping_admin"], - "task_id": 94, - "intent_template_id": 274, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the grand total of invoice 000000001. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "Get the grand total of invoice {{id}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "id": "000000001", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [36.39] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 95, - "intent_template_id": 274, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the grand total of invoice 000000002. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "Get the grand total of invoice {{id}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "id": "000000002", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [39.64] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 96, - "intent_template_id": 193, - "start_urls": ["__SHOPPING__"], - "intent": "Get the status of my latest order and when will it arrive. Return a list of objects with keys \"status\" and \"arrival_date\" (YYYY-MM-DD format or null if not available), without any additional details.", - "intent_template": "Get the status of my latest order and when will it arrive. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "retrieved_data_format_spec": "Return a list of objects with keys \"status\" and \"arrival_date\" (YYYY-MM-DD format or null if not available), without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "status" : { "type": "string" }, - "arrival_date": { "type": "string", "format": "date" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"status": "canceled", "arrival_date": null} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map", "wikipedia"], - "task_id": 97, - "intent_template_id": 120, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Get the driving distance from Carnegie Mellon University to the 3rd ranked university worldwide by SCImago Institutions Rankings in 2019. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki site to look up any needed information.)", - "intent_template": "Get the driving distance from Carnegie Mellon University to the 3rd ranked university worldwide by SCImago Institutions Rankings in 2019. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service and the provided wiki site to look up any needed information.)", - "instantiation_dict": { - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["914km"] } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/routed-car/route/v1/.*/.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 98, - "intent_template_id": 66, - "start_urls": ["__MAP__"], - "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "places": "tea cafe", - "start": "University of Pittsburgh", - "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "location": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" } - } - }, - "distance": {"type": "string", "format": "distance"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "location": { - "name": "Fuku Tea", - "house_number": "3716", - "street": "Forbes Avenue", - "city": "Pittsburgh", - "state": "Pennsylvania", - "postcode": "15213" - }, - "distance": "653m" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 99, - "intent_template_id": 66, - "start_urls": ["__MAP__"], - "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "places": "Five Guys", - "start": "5700 Penn Ave", - "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "location": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" } - } - }, - "distance": {"type": "string", "format": "distance"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "location": { - "name": "Five Guys", - "house_number": "117", - "street": "South Bouquet Street", - "city": "Pittsburgh", - "state": "Pennsylvania", - "postcode": "15213" - }, - "distance": "4km" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 100, - "intent_template_id": 66, - "start_urls": ["__MAP__"], - "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it? Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "places": "Starbucks", - "start": "Carnegie Mellon", - "retrieved_data_format_spec": "Return a list of objects with keys \"location\" and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "location": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" } - } - }, - "distance": {"type": "string", "format": "distance"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "location": { - "name": "Starbucks", - "house_number": "417", - "street": "South Craig Street", - "city": "Pittsburgh", - "state": "Pennsylvania", - "postcode": "15213" - }, - "distance": "557m" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 101, - "intent_template_id": 66, - "start_urls": ["__MAP__"], - "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it? Use \"location\" for the name and location of the place and \"distance\" for the walking distance. (Use the OSRM direction service.)", - "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "places": "In-N-Out", - "start": "Upitts", - "retrieved_data_format_spec": "Use \"location\" for the name and location of the place and \"distance\" for the walking distance" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 102, - "intent_template_id": 349, - "start_urls": ["__GITLAB__"], - "intent": "Navigate to the page showing the list of open issues in the a11yproject/a11yproject.com repository that have labels related to help wanted", - "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "help wanted", "repo": "a11yproject/a11yproject.com", "state": "open"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/?state=opened&label_name%5B%5D=help wanted" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 103, - "intent_template_id": 349, - "start_urls": ["__GITLAB__"], - "intent": "Navigate to the page showing the list of closed issues in the kkroening/ffmpeg-python repository that have labels related to question", - "intent_template": "Navigate to the page showing the list of 
{{state}} issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "question", "repo": "kkroening/ffmpeg-python", "state": "closed"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/kkroening/ffmpeg-python/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?state=closed&label_name%5B%5D=question" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 104, - "intent_template_id": 349, - "start_urls": ["__GITLAB__"], - "intent": "Navigate to the page showing the list of all issues in the keycloak/keycloak repository that have labels related to flaky-test", - "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": {"label": "flaky-test", "repo": "keycloak/keycloak", "state": "all"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/keycloak/keycloak/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/keycloak/keycloak/-/issues/?state=all&label_name%5B%5D=flaky-test" - } - } - } - ], - 
"revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 105, - "intent_template_id": 349, - "start_urls": ["__GITLAB__"], - "intent": "Navigate to the page showing the list of not yet closed issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI", - "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "OpenAPI Generator CLI", - "repo": "OpenAPITools/openapi-generator", - "state": "not yet closed" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/OpenAPITools/openapi-generator/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": [ - "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?state=opened&label_name%5B%5D=OpenAPI%20Generator%20CLI", - "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI" - ] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 106, - "intent_template_id": 349, - "start_urls": ["__GITLAB__"], - "intent": "Navigate to the page showing the list of open issues in the umano/AndroidSlidingUpPanel repository that have labels related to all except BUG", - "intent_template": "Navigate to the page showing the list of {{state}} issues in the {{repo}} repository that have labels related to {{label}}", - "instantiation_dict": { - "label": "all except BUG", - "repo": "umano/AndroidSlidingUpPanel", - "state": "open" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - 
"results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/umano/AndroidSlidingUpPanel/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?state=opened¬%5Blabel_name%5D%5B%5D=BUG" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 107, - "intent_template_id": 270, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from May 2022 through December 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", - "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "from May 2022 through December 2022, inclusive", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "May" , "count": 8 }, - { "month": "June" , "count": 13 }, - { "month": "July" , "count": 9 }, - { "month": "August" , "count": 8 }, - { "month": "September", "count": 10 }, - { "month": "October" , "count": 4 }, - { "month": "November" , "count": 5 }, - { "month": "December" , "count": 10 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 108, - "intent_template_id": 270, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from January 2023 through May 2023, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", - "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "from January 2023 through May 2023, inclusive", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "January" , "count": 12 }, - { "month": "February", "count": 7 }, - { "month": "March" , "count": 5 }, - { "month": "April" , "count": 9 }, - { "month": "May" , "count": 5 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 109, - "intent_template_id": 270, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Jan to December 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", - "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "from Jan to December 2022, inclusive", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "January" , "count": 11 }, - { "month": "February" , "count": 16 }, - { "month": "March" , "count": 14 }, - { "month": "April" , "count": 7 }, - { "month": "May" , "count": 8 }, - { "month": "June" , "count": 13 }, - { "month": "July" , "count": 9 }, - { "month": "August" , "count": 8 }, - { "month": "September", "count": 10 }, - { "month": "October" , "count": 4 }, - { "month": "November" , "count": 5 }, - { "month": "December" , "count": 10 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 110, - "intent_template_id": 270, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Jan 2022 through Nov 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", - "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "from Jan 2022 through Nov 2022, inclusive", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "January" , "count": 11 }, - { "month": "February" , "count": 16 }, - { "month": "March" , "count": 14 }, - { "month": "April" , "count": 7 }, - { "month": "May" , "count": 8 }, - { "month": "June" , "count": 13 }, - { "month": "July" , "count": 9 }, - { "month": "August" , "count": 8 }, - { "month": "September", "count": 10 }, - { "month": "October" , "count": 4 }, - { "month": "November" , "count": 5 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 111, - "intent_template_id": 270, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the monthly count of completed orders from Feb 2022 through Nov 2022, inclusive. Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details.", - "intent_template": "Get the monthly count of completed orders {{period}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "period": "from Feb 2022 through Nov 2022, inclusive", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"count\" (as integer) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "month": {"type": "string", "format": "month"}, "count": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "February" , "count": 16 }, - { "month": "March" , "count": 14 }, - { "month": "April" , "count": 7 }, - { "month": "May" , "count": 8 }, - { "month": "June" , "count": 13 }, - { "month": "July" , "count": 9 }, - { "month": "August" , "count": 8 }, - { "month": "September", "count": 10 }, - { "month": "October" , "count": 4 }, - { "month": "November" , "count": 5 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 112, - "intent_template_id": 245, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Circe fleece", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", - "instantiation_dict": {"product": "Circe fleece"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Hannah Lim"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 113, - "intent_template_id": 245, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Olivia zip jacket", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for 
{{product}}", - "instantiation_dict": {"product": "Olivia zip jacket"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Emma", "Seam Miller"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 114, - "intent_template_id": 245, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Antonia racer tank", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", - "instantiation_dict": {"product": "Antonia racer tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Shaunte", "Merrie"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 115, - "intent_template_id": 245, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for Chloe tank", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", - "instantiation_dict": {"product": "Chloe tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 116, - "intent_template_id": 245, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Return the customer nickname(s) who gave a rating of 3 stars or below for tanks products", - "intent_template": "Return the customer nickname(s) who gave a rating of 3 stars or below for {{product}}", - "instantiation_dict": {"product": "tanks 
products"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Dominic", "Trey", "Edmund", "Merrie", "Shaunte", "Teofila", "Carma", "Yan", - "Valorie", "Mikkel", "Matthew", "Monroe", "Scotty", "Alexander", "Tracey", - "Joey" - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 117, - "intent_template_id": 161, - "start_urls": ["__SHOPPING__"], - "intent": "Get the date when I made my first purchase on this site. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Get the date when I made my first purchase on this site. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["March 2, 2022"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 118, - "intent_template_id": 151, - "start_urls": ["__SHOPPING__"], - "intent": "I have a jaw bruxism problem, go to the product page for something that could alleviate the problem.", - "intent_template": "I have a jaw bruxism problem, go to the product page for something that could alleviate the problem.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": true, - "expected": { - "url": 
"^__SHOPPING__/[a-z0-9-]*(?:guard|mouth|teeth|night|dental|bruxism)[a-z0-9-]*\\.html$", - "response_status": 200 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 119, - "intent_template_id": 250, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Antonia Racer Tank. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Antonia Racer Tank", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "A regular or me", "rating": 4} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 120, - "intent_template_id": 250, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Ana Running Short. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Ana Running Short", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "title": "It was really hard to find the right siz", "rating": 4 }, - { "title": "VERY LIGHTWEIGHT COMFY-GOOD SHOES" , "rating": 5 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 121, - "intent_template_id": 250, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Circe hooded fleece. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Circe hooded fleece", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 122, - "intent_template_id": 250, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Olivia zip jacket. 
Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Olivia zip jacket", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "Quite good", "rating": 5} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 123, - "intent_template_id": 250, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get title and rating for all reviews with 4 stars or above for Circe's products. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get title and rating for all reviews with 4 stars or above for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Circe's products", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "number"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "Good but not perfect", "rating": 4} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 124, - "intent_template_id": 159, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of wireless earphone in the One Stop Market?. 
Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range of {{product}} in the One Stop Market?. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "wireless earphone", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 0.01, "max": 298.0} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 125, - "intent_template_id": 159, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range of {{product}} in the One Stop Market?. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "teeth grinding mouth guard", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 1.46, "max": 179.99} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 126, - "intent_template_id": 159, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range of Canon photo printer in the One Stop Market?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range of {{product}} in the One Stop Market?. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Canon photo printer", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 2.56, "max": 649.99} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 127, - "intent_template_id": 1001, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the top 3 search terms that match available products in the store.", - "intent_template": "Get the top 3 search terms that match available products in the store.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Hollister", "Joust Bag", "Antonia Racer Tank"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 128, - "intent_template_id": 1002, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What's the total number of items sold in the most recent 2 complete orders?", - "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", - "instantiation_dict": {"k": "2"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [3] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 129, - "intent_template_id": 1002, - "start_urls": 
["__SHOPPING_ADMIN__"], - "intent": "What's the total number of items sold in the most recent 4 complete orders?", - "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", - "instantiation_dict": {"k": "4"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [9] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 130, - "intent_template_id": 1002, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What's the total number of items sold in the most recent 5 complete orders?", - "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", - "instantiation_dict": {"k": "5"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [12] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 131, - "intent_template_id": 1002, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "What's the total number of items sold in the most recent 7 complete orders?", - "intent_template": "What's the total number of items sold in the most recent {{k}} complete orders?", - "instantiation_dict": {"k": "7"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [17] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 132, - "intent_template_id": 322, - "start_urls": ["__GITLAB__"], - "intent": "How many commits did kilian make to a11yproject.com on March 5, 2023?", - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": 
{"user": "kilian", "repo": "a11yproject.com", "date": "March 5, 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 133, - "intent_template_id": 322, - "start_urls": ["__GITLAB__"], - "intent": "How many commits did Eric make to a11yproject.com on March 2, 2023?", - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "Eric", "repo": "a11yproject.com", "date": "March 2, 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 134, - "intent_template_id": 322, - "start_urls": ["__GITLAB__"], - "intent": "How many commits did kilian make to a11yproject.com on March 1, 2023?", - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": {"user": "kilian", "repo": "a11yproject.com", "date": "March 1, 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 135, - "intent_template_id": 322, - "start_urls": ["__GITLAB__"], - "intent": "How many commits did Eric and Kilian make to a11yproject.com on January 3, 2023?", - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "Eric and Kilian", - "repo": "a11yproject.com", - "date": "January 3, 2023" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", 
- "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 136, - "intent_template_id": 322, - "start_urls": ["__GITLAB__"], - "intent": "How many commits did Steven Woodson make to a11y-webring.club on February 6, 2023?", - "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?", - "instantiation_dict": { - "user": "Steven Woodson", - "repo": "a11y-webring.club", - "date": "February 6, 2023" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 137, - "intent_template_id": 51, - "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the city where the Liberty Bell is located", - "city2": "the home city of Pirates", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["5h 47min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 138, - "intent_template_id": 51, - "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the big apple", - "city2": "the city with the most authentic Philly cheesesteaks", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1h 58min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 139, - "intent_template_id": 51, - "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? 
{{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the hometown of Joe Biden", - "city2": "Bridgeport", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3h 20min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 140, - "intent_template_id": 51, - "start_urls": ["__MAP__"], - "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the city of Niagara Falls", - "city2": "the city of Yale University", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["8h 33min"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 141, - "intent_template_id": 162, - "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on food-related shopping during March 2023 without considering shipping and handling fee. 
Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "category": "food-related", - "time": "during March 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [32.41] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 142, - "intent_template_id": 162, - "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on hair care and hair style shopping during Jan 2023 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "category": "hair care and hair style", - "time": "during Jan 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [68.51] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 143, - "intent_template_id": 162, - "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on home decoration shopping during January 29, 2023 without considering shipping and handling fee. 
Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "category": "home decoration", - "time": "during January 29, 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [260.69] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 144, - "intent_template_id": 162, - "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on food shopping from January 15 to January 31 2023 without considering shipping and handling fee. ", - "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "category": "food", - "time": "from January 15 to January 31 2023", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 145, - "intent_template_id": 162, - "start_urls": ["__SHOPPING__"], - "intent": "Return how much I spent on cooking and food shopping during March 2022 without considering shipping and handling fee. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return how much I spent on {{category}} shopping {{time}} without considering shipping and handling fee. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "category": "cooking and food", - "time": "during March 2022", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [42.35] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 146, - "intent_template_id": 155, - "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought in Sep 2022. Return a list of objects with keys \"width\" and \"height\".", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "option": "size", - "product": "picture frame", - "time": "in Sep 2022", - "retrieved_data_format_spec": " Return a list of objects with keys \"width\" and \"height\"." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "width": {"type": "string"}, "height": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "width": "^16(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$", - "height": "^24(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 147, - "intent_template_id": 155, - "start_urls": ["__SHOPPING__"], - "intent": "Get the size of the picture frame I bought in 2022. 
Return a list of objects with keys \"width\" and \"height\".", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "option": "size", - "product": "picture frame", - "time": "in 2022", - "retrieved_data_format_spec": " Return a list of objects with keys \"width\" and \"height\"." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "width": {"type": "string"}, "height": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "width": "^16(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$", - "height": "^24(?:\\.0+)?\\s*[-]?\\s*(?:inch(?:es)?|in\\.?|\u2033|\"|'|')$" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 148, - "intent_template_id": 155, - "start_urls": ["__SHOPPING__"], - "intent": "Get the color of the picture frame I bought Sep 2022.", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "option": "color", - "product": "picture frame", - "time": "Sep 2022", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Mist 16*24"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 149, - "intent_template_id": 155, - "start_urls": ["__SHOPPING__"], - "intent": "Get the color of the artifical plants I bought Feb 2023.", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "option": "color", - "product": "artifical plants", - "time": "Feb 2023", - "retrieved_data_format_spec": "" 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Green-vines"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 150, - "intent_template_id": 155, - "start_urls": ["__SHOPPING__"], - "intent": "Get the price of the fake tree I bought Jan 2023.Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the {{option}} of the {{product}} I bought {{time}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "option": "price", - "product": "fake tree", - "time": "Jan 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [260.69] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 151, - "intent_template_id": 36, - "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location1": "CMU", - "location2": "University of Pittsburgh", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 152, - "intent_template_id": 36, - "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from Schenley park to Upitt? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location1": "Schenley park", - "location2": "Upitt", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 153, - "intent_template_id": 36, - "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from REI to CMU? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location1": "REI", - "location2": "CMU", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["7min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 154, - "intent_template_id": 36, - "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from CMU gates building to Schenley park? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location1": "CMU gates building", - "location2": "Schenley park", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 155, - "intent_template_id": 36, - "start_urls": ["__MAP__"], - "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park? Return the value as a string in HH:MM:SS format only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location1": "Animal Rescue League of Pittsburgh", - "location2": "Schenley park", - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["9min"] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 156, - "intent_template_id": 290, - "start_urls": ["__GITLAB__"], - "intent": "Go to the merge requests assigned to me", - "intent_template": "Go to the merge requests assigned to me", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/dashboard/merge_requests", - "query_params": { "assignee_username": ["byteblaze"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 157, - "intent_template_id": 255, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "View the details of all customers", - "intent_template": "View the details of all customers", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING_ADMIN__/customer/index/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 158, - "intent_template_id": 171, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the product page of the best storage option that fits 11 Nintendo Switch game cards", - "intent_template": "Go to 
the product page of the best storage option that fits {{num}} Nintendo Switch game cards", - "instantiation_dict": {"num": 11}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 159, - "intent_template_id": 171, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the product page of the best storage option that fits 31 Nintendo Switch game cards", - "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", - "instantiation_dict": {"num": 31}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 160, - "intent_template_id": 171, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the product page of the best storage option that fits 6 Nintendo Switch game cards", - "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", - "instantiation_dict": {"num": 6}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 161, - "intent_template_id": 171, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the product page of the best storage option that fits 23 Nintendo Switch game cards", - "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", - "instantiation_dict": {"num": 23}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 162, - "intent_template_id": 171, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the product page of the best storage option that fits 40 Nintendo Switch game cards", - "intent_template": "Go to the product page of the best storage option that fits {{num}} Nintendo Switch game cards", - "instantiation_dict": {"num": 40}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 163, - 
"intent_template_id": 136, - "start_urls": [ - "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html" - ], - "intent": "Get all review titles with 2 stars or below for the product on the current page.", - "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Memory Card Came Defective"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 164, - "intent_template_id": 136, - "start_urls": ["__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html"], - "intent": "Get all review titles with 2 stars or below for the product on the current page.", - "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Meh"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 165, - "intent_template_id": 136, - "start_urls": ["__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html"], - "intent": "Get all review titles with 2 stars or below for the product on the current page.", - "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["So cute but too small", "Toe rubbed"] - } - } - ], - "revision": 2 - }, - { - 
"sites": ["shopping"], - "task_id": 166, - "intent_template_id": 136, - "start_urls": [ - "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html" - ], - "intent": "Get all review titles with 2 stars or below for the product on the current page.", - "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 167, - "intent_template_id": 136, - "start_urls": ["__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html"], - "intent": "Get all review titles with 2 stars or below for the product on the current page.", - "intent_template": "Get all review titles with 2 stars or below for the product on the current page.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Waste of big money"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 168, - "intent_template_id": 289, - "start_urls": ["__GITLAB__"], - "intent": "Get the project ID(s) of my personal project(s) that received more than 100 stars", - "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", - "instantiation_dict": {"description": "more than 100 stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 169, - "intent_template_id": 289, - "start_urls": ["__GITLAB__"], - 
"intent": "Get the project ID(s) of my personal project(s) that received the most stars", - "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", - "instantiation_dict": {"description": "the most stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [187, 183] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 170, - "intent_template_id": 289, - "start_urls": ["__GITLAB__"], - "intent": "Get the project ID(s) of my personal project(s) that received the least stars", - "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", - "instantiation_dict": {"description": "the least stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [193, 190, 189, 188, 184, 181] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 171, - "intent_template_id": 289, - "start_urls": ["__GITLAB__"], - "intent": "Get the project ID(s) of my personal project(s) that received less than 5 stars", - "intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", - "instantiation_dict": {"description": "less than 5 stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [182, 179, 186, 185, 193, 190, 189, 188, 184, 181] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 172, - "intent_template_id": 289, - "start_urls": ["__GITLAB__"], - "intent": "Get the project ID(s) of my personal project(s) that received no stars", - 
"intent_template": "Get the project ID(s) of my personal project(s) that received {{description}}", - "instantiation_dict": {"description": "no stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [193, 190, 189, 188, 184, 181] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 173, - "intent_template_id": 310, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"better\" in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "better", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 174, - "intent_template_id": 310, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"feature\" in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "feature", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 175, - "intent_template_id": 310, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"dependency\" in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "dependency", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 176, - "intent_template_id": 310, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"theme editor\" in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "theme editor", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 177, - "intent_template_id": 310, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest updated issue with \"homepage content\" in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest updated issue with \"{{keyword}}\" in its title is closed. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "homepage content", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 178, - "intent_template_id": 500, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with better in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "better", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 179, - "intent_template_id": 500, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with feature in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "feature", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 180, - "intent_template_id": 500, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with dependency in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "dependency", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 181, - "intent_template_id": 500, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with theme editor in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "theme editor", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [false] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 182, - "intent_template_id": 500, - "start_urls": ["__GITLAB__"], - "intent": "Get whether my latest created issue with homepage content in its title is closed. Return a boolean (true if closed, false if opened).", - "intent_template": "Get whether my latest created issue with {{keyword}} in its title is closed. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "keyword": "homepage content", - "retrieved_data_format_spec": "Return a boolean (true if closed, false if opened)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "boolean"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [true] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 183, - "intent_template_id": 368, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the SKU of the products that have 10 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", - "instantiation_dict": {"Attribute": "SKU", "N": "10", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 184, - "intent_template_id": 368, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the name and color of the products that have 0 units left. Return a list of objects with keys \"name\" and \"color\".", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", - "instantiation_dict": { - "Attribute": "name and color", - "N": "0", - "retrieved_data_format_spec": ". Return a list of objects with keys \"name\" and \"color\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "color": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "Cronus Yoga Pant -33-Blue", "color": "Blue"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 185, - "intent_template_id": 368, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the material of the products that have 3 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", - "instantiation_dict": {"Attribute": "material", "N": "3", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Cotton", "Fleece"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 186, - "intent_template_id": 368, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the product names and the sizes of the products that have 2-3 units left. Return a list of objects with keys \"name\" and \"size\".", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", - "instantiation_dict": { - "Attribute": "product names and the sizes", - "N": "2-3", - "retrieved_data_format_spec": ". Return a list of objects with keys \"name\" and \"size\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "size": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "name": "Eos V-Neck Hoodie-S-Blue" , "size": "S" }, - { "name": "Minerva LumaTech\u2122 V-Tee-XS-Blue", "size": "XS" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 187, - "intent_template_id": 368, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Give me the SKU of the products that have 1-3 units left", - "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left{{retrieved_data_format_spec}}", - "instantiation_dict": {"Attribute": "SKU", "N": "1-3", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["WH11-S-Blue", "WS08-XS-Blue"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 188, - "intent_template_id": 214, - "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"cancelled\". Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the total cost of my latest order {{status}}. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "status": "marked as \"cancelled\"", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [365.42] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 189, - "intent_template_id": 214, - "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"pending\". Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the total cost of my latest order {{status}}. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "status": "marked as \"pending\"", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [754.99] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 190, - "intent_template_id": 214, - "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"complete\". Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the total cost of my latest order {{status}}. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "status": "marked as \"complete\"", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [65.32] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 191, - "intent_template_id": 214, - "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order marked as \"processing\". ", - "intent_template": "Get the total cost of my latest order {{status}}. {{retrieved_data_format_spec}}", - "instantiation_dict": {"status": "marked as \"processing\"", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 192, - "intent_template_id": 214, - "start_urls": ["__SHOPPING__"], - "intent": "Get the total cost of my latest order that is not cancelled. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the total cost of my latest order {{status}}. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "status": "that is not cancelled", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [754.99] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 193, - "intent_template_id": 367, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 2 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "{{payment_query}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "payment_query": "Get the total payment amount of the last 2 completed orders", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [182.4] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 194, - "intent_template_id": 367, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "payment_query": "Get the total payment amount of the last 5 completed orders", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [555.2] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 195, - "intent_template_id": 367, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 pending orders. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "{{payment_query}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "payment_query": "Get the total payment amount of the last 5 pending orders", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [885.4] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 196, - "intent_template_id": 367, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "{{payment_query}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "payment_query": "Get the payment difference between the last 4 cancelled orders and the last 4 completed orders", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [194.25] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 197, - "intent_template_id": 367, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total payment amount of the last 5 non-cancelled orders. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "{{payment_query}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "payment_query": "Get the total payment amount of the last 5 non-cancelled orders", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [778.2] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 198, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer email of the most recent cancelled order.", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "customer email", - "status": "most recent cancelled", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { 
- "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["harrypotterfan1@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 199, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the order ID of the newest pending order.", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "order ID", - "status": "newest pending", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [299] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 200, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the billing name of the oldest complete order.", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "billing name", - "status": "oldest complete", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["John Lee"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 201, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer email of the earliest fraud suspect order.", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "customer email", - "status": "earliest fraud suspect", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": 
{"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 202, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the date of the most recent cancelled order.Return the date in YYYY-MM-DD format or null if not available, without any additional details", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "date", - "status": "most recent cancelled", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["May 23 2023"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 203, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the purchase date and order id of the most recent pending order.Return a list of objects with keys \"purchase_date\" (YYYY-MM-DD format or null if not available) and \"order_id\", without any additional details", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "purchase date and order id", - "status": "most recent pending", - "retrieved_data_format_spec": "Return a list of objects with keys \"purchase_date\" (YYYY-MM-DD format or null if not available) and \"order_id\", without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "date": {"type": "string", "format": "date"}, "order_id": {"type": "string"} } - } - }, - "expected": { - 
"task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"date": "May 31, 2023", "order_id": "000000299"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 204, - "intent_template_id": 366, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the product name and final price (low to high) of the most recent completed order.Return a list of objects with keys \"name\" (product name) and \"price\" (as number, e.g., 10.99) only, without any additional details", - "intent_template": "Get the {{attribute}} of the {{status}} order.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "product name and final price (low to high)", - "status": "most recent completed", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" (product name) and \"price\" (as number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "ordered": true, - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name" : { "type": "string" }, - "price": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "name": "Ida Workout Parachute Pant", "price": 38.4 }, - { "name": "Proteus Fitness Jackshirt" , "price": 45.0 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 205, - "intent_template_id": 320, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did kilian make on March 5, 2023 for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", - "instantiation_dict": {"user": "kilian", "date": "March 5, 2023", "modifier": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": 
"retrieve", "status": "SUCCESS", "retrieved_data": [1] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 206, - "intent_template_id": 320, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Eric make on March 2, 2023 for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", - "instantiation_dict": {"user": "Eric", "date": "March 2, 2023", "modifier": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [2] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 207, - "intent_template_id": 320, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Eric and Kilian make on January 3, 2023 in total for the current project?", - "intent_template": "How many commits did {{user}} make on {{date}}{{modifier}} for the current project?", - "instantiation_dict": {"user": "Eric and Kilian", "date": "January 3, 2023", "modifier": " in total"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 208, - "intent_template_id": 364, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number +1 2058812302. Return a list of objects with keys \"name\" and \"email\".", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "PhoneNum": "+1 2058812302", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "John Smith", "email": "john.smith.xyz@gmail.com"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 209, - "intent_template_id": 364, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 2137418080. Return a list of objects with keys \"name\" and \"email\".", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "PhoneNum": "2137418080", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "Jennifer White", "email": "jennifer.white@yahoo.com"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 210, - "intent_template_id": 364, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 2065555555. Return a list of objects with keys \"name\" and \"email\".", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "PhoneNum": "2065555555", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "Adam Garcia", "email": "gamingpro456@gmail.com"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 211, - "intent_template_id": 364, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 8015551212. Return a list of objects with keys \"name\" and \"email\".", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "PhoneNum": "8015551212", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "Sean Miller", "email": "sean.miller@gmail.com"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 212, - "intent_template_id": 364, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the customer name and email with phone number 555-229-3326. Return a list of objects with keys \"name\" and \"email\".", - "intent_template": "Get the customer name and email with phone number {{PhoneNum}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "PhoneNum": "555-229-3326", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\" and \"email\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "name": {"type": "string"}, "email": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"name": "Veronica Costello", "email": "roni_cost@example.com"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 213, - "intent_template_id": 249, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Antonia Racer Tank. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Antonia Racer Tank", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "title": "Zero support/modesty", "rating": "2" }, - { "title": "Not for high impact" , "rating": "3" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 214, - "intent_template_id": 249, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Erica Sports Bra. 
Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Erica Sports Bra", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "title": "Doesn't fit me. Luma fail.", "rating": "2" }, - { "title": "does not fit. worthless." , "rating": "1" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 215, - "intent_template_id": 249, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Circe ice fleece. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Circe ice fleece", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "Bad!", "rating": "1"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 216, - "intent_template_id": 249, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Electra Bra Top. 
Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Electra Bra Top", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"title": "Not exactly true to size", "rating": "3"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 217, - "intent_template_id": 249, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the title and rating for all reviews with 3 stars or below for Pursuit Tone Band. Return a list of objects with keys \"title\" and \"rating\".", - "intent_template": "Get the title and rating for all reviews with 3 stars or below for {{product}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Pursuit Tone Band", - "retrieved_data_format_spec": "Return a list of objects with keys \"title\" and \"rating\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "title": {"type": "string"}, "rating": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "title": "Agreed. 
More resistance", "rating": "3" }, - { "title": "Want more resistance" , "rating": "3" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 218, - "intent_template_id": 41, - "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to CMU, Pittsburgh that take at most 5 minutes. Use \"hotel\" for the name and \"distance\" for the distance. (Use the OSRM direction service.)", - "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location": "CMU, Pittsburgh", - "n": "5", - "retrieved_data_format_spec": "Use \"hotel\" for the name and \"distance\" for the distance" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 219, - "intent_template_id": 41, - "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to Pittsburgh airport that take at most 3 minutes. Use \"hotel\" for the name and \"distance\" for the distance. (Use the OSRM direction service.)", - "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location": "Pittsburgh airport", - "n": "3", - "retrieved_data_format_spec": "Use \"hotel\" for the name and \"distance\" for the distance" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 220, - "intent_template_id": 41, - "start_urls": ["__MAP__"], - "intent": "Get the name of the hotel and walking distance of nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes. Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Get the name of the hotel and walking distance of nearby hotels to {{location}} that take at most {{n}} minutes. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location": "Gardner Steel Conference Center,", - "n": 5, - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" (hotel name) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel" : { "type": "string", "format": "location-name" }, - "distance": { "type": "string", "format": "distance" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "distance": "375m", "hotel": "Wyndham Pittsburgh University Center" }, - { "distance": "338m", "hotel": "The Oaklander Hotel" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 221, - "intent_template_id": 35, - "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest USPS postal office with different transportation methods? Return a list of objects with keys \"transportation_method\" (Walking, Driving, or Biking) and \"duration\" (in HH:MM:SS format) only, without any additional details or text. (Use the OSRM direction service.)", - "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location": "USPS postal office", - "retrieved_data_format_spec": "Return a list of objects with keys \"transportation_method\" (Walking, Driving, or Biking) and \"duration\" (in HH:MM:SS format) only, without any additional details or text" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "transportation_method": { "type": "string" }, - "duration" : { "type": "string", "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "transportation_method": "Walking", "duration": "19min" }, - { "transportation_method": "Driving", "duration": "2min" }, - { "transportation_method": "Biking" , "duration": "7min" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 222, - "intent_template_id": 35, - "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest cold stone ice cream with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", - "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location": "cold stone ice cream", - "retrieved_data_format_spec": "Return duration in HH:MM:SS format" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 223, - "intent_template_id": 35, - "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest Mcdonald's with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", - "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location": "Mcdonald's", - "retrieved_data_format_spec": "Return duration in HH:MM:SS format" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["4min"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 224, - "intent_template_id": 35, - "start_urls": ["__MAP__"], - "intent": "I am at CMU Pittsburgh, how long does it take to reach the nearest wendys with different transportation methods? Return duration in HH:MM:SS format. (Use the OSRM direction service.)", - "intent_template": "I am at CMU Pittsburgh, how long does it take to reach the nearest {{location}} with different transportation methods? {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location": "wendys", - "retrieved_data_format_spec": "Return duration in HH:MM:SS format" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["3min"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 225, - "intent_template_id": 135, - "start_urls": ["__SHOPPING__"], - "intent": "Return the titles for reviews with 3 stars or below for brush from sephora", - "intent_template": "Return the titles for reviews with 3 stars or below for {{product_type}} from {{manufature}}", - "instantiation_dict": {"product_type": "brush", "manufature": "sephora"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 226, - "intent_template_id": 370, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from Amazon basic?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "brand": "Amazon basic", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 5.49, "max": 375.19} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 227, - "intent_template_id": 370, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from EYZUTAK?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range for products from {{brand}}?. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "brand": "EYZUTAK", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 9.99, "max": 9.99} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 228, - "intent_template_id": 370, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from sephora?. 
Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range for products from {{brand}}?. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "brand": "sephora", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 18.18, "max": 94.99} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 229, - "intent_template_id": 370, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from ugreen?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "brand": "ugreen", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 6.99, "max": 38.99} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 230, - "intent_template_id": 370, - "start_urls": ["__SHOPPING__"], - "intent": "What is the price range for products from Perricone MD?. Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "What is the price range for products from {{brand}}?. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "brand": "Perricone MD", - "retrieved_data_format_spec": "Return an object with keys \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "min": { "type": "number", "format": "currency" }, - "max": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"min": 35.0, "max": 149.0} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 231, - "intent_template_id": 213, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent cancelled order", - "intent_template": "Get the order number of my most recent {{status}} order", - "instantiation_dict": {"status": "cancelled"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^#?\\s*0*170$"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 232, - "intent_template_id": 213, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent pending order", - "intent_template": "Get the order number of my most recent {{status}} order", - "instantiation_dict": {"status": "pending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^#?\\s*0*189$"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 233, - "intent_template_id": 213, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent complete order", - "intent_template": 
"Get the order number of my most recent {{status}} order", - "instantiation_dict": {"status": "complete"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^#?\\s*0*180$"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 234, - "intent_template_id": 213, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent on hold order", - "intent_template": "Get the order number of my most recent {{status}} order", - "instantiation_dict": {"status": "on hold"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 235, - "intent_template_id": 213, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order number of my most recent under delivery order", - "intent_template": "Get the order number of my most recent {{status}} order", - "instantiation_dict": {"status": "under delivery"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 236, - "intent_template_id": 39, - "start_urls": ["__MAP__"], - "intent": "Get the name and address of the nearest pharmacy from Carnegie Mellon I can walk within 20mins (use OSRM direction service). Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\".", - "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "pharmacy", - "location2": "Carnegie Mellon", - "condition": "I can walk within 20mins (use OSRM direction service)", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Schiller's Pharmacy", - "house_number": "811", - "street": "South Aiken Avenue", - "city": "Pittsburgh", - "state": "Pennsylvania", - "postcode": "15232" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 237, - "intent_template_id": 39, - "start_urls": ["__MAP__"], - "intent": "Get the name and address of the nearest gas station from CMU . Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\".", - "intent_template": "Get the name and address of the nearest {{location}} from {{location2}} {{condition}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "gas station", - "location2": "CMU", - "condition": "", - "retrieved_data_format_spec": "Return a list of objects with keys \"name\", \"house_number\", \"street\", \"city\", \"state\", and \"postcode\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "name" : { "type": "string" }, - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Sunoco", - "house_number": "", - "street": "North Craig Street", - "city": "Pittsburgh", - "state": "Pennsylvania", - "postcode": "15213" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 238, - "intent_template_id": 138, - "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive PS4 accessories", - "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", - "instantiation_dict": {"product_category": "PS4 accessories"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 239, - "intent_template_id": 138, - "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, go to the product 
page for the most expensive nutrition bars and drinks", - "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", - "instantiation_dict": {"product_category": "nutrition bars and drinks"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 240, - "intent_template_id": 138, - "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive men's Uniforms, Work & Safety", - "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", - "instantiation_dict": {"product_category": "men's Uniforms, Work & Safety"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/arc-teryx-beta-sv-bib-men-s-versatile-durable-gore-tex-pro-bib-for-severe-conditions.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 241, - "intent_template_id": 138, - "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive skin care", - "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", - "instantiation_dict": {"product_category": "skin care"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - 
"results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 242, - "intent_template_id": 138, - "start_urls": ["__SHOPPING__"], - "intent": "I am doing a market survey for one stop market, go to the product page for the most expensive Household Supplies", - "intent_template": "I am doing a market survey for one stop market, go to the product page for the most expensive {{product_category}}", - "instantiation_dict": {"product_category": "Household Supplies"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 243, - "intent_template_id": 244, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the email address of the customer who is the most unhappy with Circe fleece", - "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": {"information": "email address", "product": "Circe fleece"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["hannah.lim@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 244, - "intent_template_id": 244, - 
"start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the email address of the customer who is the most unhappy with Olivia zip jacket", - "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": {"information": "email address", "product": "Olivia zip jacket"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["emma.lopez@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 245, - "intent_template_id": 244, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the name of the customer who is the most unhappy with Antonia racer tank", - "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": {"information": "name", "product": "Antonia racer tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Shaunte"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 246, - "intent_template_id": 244, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the name of the customer who is the most unhappy with Chloe tank", - "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": {"information": "name", "product": "Chloe tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["Teofila"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 247, - "intent_template_id": 244, - "start_urls": 
["__SHOPPING_ADMIN__"], - "intent": "Get the email address of the customer who is the most unhappy with the style of Zoe products", - "intent_template": "Get the {{information}} of the customer who is the most unhappy with {{product}}", - "instantiation_dict": {"information": "email address", "product": "the style of Zoe products"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 248, - "intent_template_id": 46, - "start_urls": ["__MAP__"], - "intent": "Get the coordinates of Carnegie Mellon Caf\u00e9. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", - "intent_template": "Get the coordinates of {{location}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "Carnegie Mellon Caf\u00e9", - "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"format": "coordinates"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"latitude": "40.4424191", "longitude": "-79.9397388"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 249, - "intent_template_id": 46, - "start_urls": ["__MAP__"], - "intent": "Get the coordinates of Western Pennsylvania Hospital Heliport. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", - "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "Western Pennsylvania Hospital Heliport", - "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"format": "coordinates"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"latitude": "40.46076", "longitude": "-79.94666"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 250, - "intent_template_id": 46, - "start_urls": ["__MAP__"], - "intent": "Get the coordinates of Apple Store near Pitt. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", - "intent_template": "Get the coordinates of {{location}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "Apple Store near Pitt", - "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"format": "coordinates"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"latitude": "40.4511693", "longitude": "-79.9334241"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 251, - "intent_template_id": 46, - "start_urls": ["__MAP__"], - "intent": "Get the coordinates of bus stop on the Carnegie art museum side of the street near CMU. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", - "intent_template": "Get the coordinates of {{location}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "bus stop on the Carnegie art museum side of the street near CMU", - "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"format": "coordinates"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"latitude": "40.4443", "longitude": "-79.94889"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 252, - "intent_template_id": 46, - "start_urls": ["__MAP__"], - "intent": "Get the coordinates of Tokyo Japanese Food Store in Pittsburgh. Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details.", - "intent_template": "Get the coordinates of {{location}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "location": "Tokyo Japanese Food Store in Pittsburgh", - "retrieved_data_format_spec": "Return an object with keys \"latitude\" and \"longitude\" (as decimal degrees) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"format": "coordinates"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"latitude": "40.45761", "longitude": "-79.92934"} ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 253, - "intent_template_id": 501, - "start_urls": ["__MAP__"], - "intent": "Get the phone number for Carnegie Mellon Caf\u00e9", - "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", - "instantiation_dict": { - "information": "phone number", - "location": "Carnegie Mellon Caf\u00e9", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", 
- "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 254, - "intent_template_id": 501, - "start_urls": ["__MAP__"], - "intent": "Get the phone number for Western Pennsylvania Hospital", - "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", - "instantiation_dict": { - "information": "phone number", - "location": "Western Pennsylvania Hospital", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["4125785000"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 255, - "intent_template_id": 501, - "start_urls": ["__MAP__"], - "intent": "Get the operator for PIT airport", - "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", - "instantiation_dict": { - "information": "operator", - "location": "PIT airport", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "location-name"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Allegheny County Airport Authority"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 256, - "intent_template_id": 501, - "start_urls": ["__MAP__"], - "intent": "Get the website for Carnegie art museum in pittsburgh", - "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", - "instantiation_dict": { - "information": "website", - "location": "Carnegie art museum in pittsburgh", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": 
{ "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["http://web.cmoa.org/"] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 257, - "intent_template_id": 501, - "start_urls": ["__MAP__"], - "intent": "Get the hours of operation for Tokyo Japanese Food Store in Pittsburgh. Return a list of objects with keys \"day\", \"open_time\", and \"close_time\".", - "intent_template": "Get the {{information}} for {{location}}{{retrieved_data_format_spec}}", - "instantiation_dict": { - "information": "hours of operation", - "location": "Tokyo Japanese Food Store in Pittsburgh", - "retrieved_data_format_spec": ". Return a list of objects with keys \"day\", \"open_time\", and \"close_time\"." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "day" : { "type": "string" }, - "open_time" : { "type": "string" }, - "close_time": { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "day": "Wednesday", "open_time": "10:00", "close_time": "17:00" }, - { "day": "Thursday" , "open_time": "10:00", "close_time": "17:00" }, - { "day": "Friday" , "open_time": "10:00", "close_time": "17:00" }, - { "day": "Saturday" , "open_time": "10:00", "close_time": "17:00" }, - { "day": "Sunday" , "open_time": "10:00", "close_time": "17:00" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 258, - "intent_template_id": 325, - "start_urls": ["__GITLAB__"], - "intent": "Open the public projects listing", - "intent_template": "Open the public projects listing", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { "url": "__GITLAB__/explore", "query_params": { "visibility_level": ["20"] } } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 259, - "intent_template_id": 312, - "start_urls": ["__GITLAB__"], - "intent": "Get me my RSS feed token", - "intent_template": "Get me my RSS feed token", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["TMN_bBn9Z48qVbUFZV45"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 260, - "intent_template_id": 211, - "start_urls": ["__SHOPPING__"], - "intent": "Open the Video Game category page to browse products", - "intent_template": "Open the {{category}} category page to browse products", - "instantiation_dict": {"category": "Video Game"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/video-games.html"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 261, - "intent_template_id": 211, - "start_urls": ["__SHOPPING__"], - "intent": "Open the Headphones category page to browse products", - "intent_template": "Open the {{category}} category page to browse products", - "instantiation_dict": {"category": "Headphones"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/electronics/headphones.html"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 262, - "intent_template_id": 211, - "start_urls": 
["__SHOPPING__"], - "intent": "Open the Men shoes category page to browse products", - "intent_template": "Open the {{category}} category page to browse products", - "instantiation_dict": {"category": "Men shoes"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 263, - "intent_template_id": 211, - "start_urls": ["__SHOPPING__"], - "intent": "Open the Woman clothing category page to browse products", - "intent_template": "Open the {{category}} category page to browse products", - "instantiation_dict": {"category": "Woman clothing"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 264, - "intent_template_id": 211, - "start_urls": ["__SHOPPING__"], - "intent": "Open the Cabinets, Racks & Shelves category page to browse products", - "intent_template": "Open the {{category}} category page to browse products", - "instantiation_dict": {"category": "Cabinets, Racks & Shelves"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 265, - "intent_template_id": 
85, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Get the relation ID of the closest national park to Boston and the distance to drive there. Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "city": "Boston", - "travel_mode": "drive", - "metric_phrase": "distance", - "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "relation_id": { "type": "integer" }, - "distance" : { "type": "string" , "format": "distance" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"relation_id": 2176999, "distance": "459km"} ] - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-68.2177005,44.3494709;-71.0579762,42.3603713.*$"} - } - ], - "revision": 4 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 266, - "intent_template_id": 85, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Get the relation ID of the closest national park to the largest city 
in Maine and the distance to drive there. Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "city": "the largest city in Maine", - "travel_mode": "drive", - "metric_phrase": "distance", - "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"distance\" (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "relation_id": { "type": "integer" }, - "distance" : { "type": "string" , "format": "distance" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"relation_id": 2176999, "distance": "290km"} ] - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-68.2177005,44.3494709;-70.2545299,43.6599147.*$"} - } - ], - "revision": 4 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 267, - "intent_template_id": 85, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Get the relation ID of the closest national park to the hometown of Stephen King and the time to drive there. 
Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "city": "the hometown of Stephen King", - "travel_mode": "drive", - "metric_phrase": "time", - "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "relation_id": { "type": "integer" }, - "duration" : { "type": "string" , "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"relation_id": 2176999, "duration": "01:33:00"} ] - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": { - "url": "^.*/route/.*/-68.2177005,44.3494709;-68.767507,44.8030715.*$", - "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} - } - } - ], - "revision": 4 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 268, - "intent_template_id": 85, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "Get the relation ID of the closest national park to Vinalhaven, ME and the time to bike there. 
Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "Get the relation ID of the closest national park to {{city}} and the {{metric_phrase}} to {{travel_mode}} there. {{retrieved_data_format_spec}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "city": "Vinalhaven, ME", - "travel_mode": "bike", - "metric_phrase": "time", - "retrieved_data_format_spec": "Return a list of objects with keys \"relation_id\" (integer) and \"duration\" (in HH:MM:SS format) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "relation_id": { "type": "integer" }, - "duration" : { "type": "string" , "format": "duration" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ {"relation_id": 2176999, "duration": "10:58:00"} ] - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": { - "url": "^.*/route/v1/.*/-68.2177005,44.3494709;-68.8315387,44.0478975.*$", - "headers": {"Cookie": "^.*_osm_directions_engine=fossgis_osrm_bicycle.*$"} - } - } - ], - "revision": 4 - }, - { - "sites": ["shopping"], - "task_id": 269, - "intent_template_id": 139, - "start_urls": ["__SHOPPING__"], - "intent": "Open the \"women shoes\" category page filtered to under $25", - "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", - "instantiation_dict": {"price_range": "under $25", 
"product_category": "women shoes"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!price$).+$"], - "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html", - "query_params": { "price": ["0-25"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 270, - "intent_template_id": 139, - "start_urls": ["__SHOPPING__"], - "intent": "Open the \"men shoes\" category page filtered to under $30", - "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", - "instantiation_dict": {"price_range": "under $30", "product_category": "men shoes"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html", - "query_params": { "price": ["0-30"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 271, - "intent_template_id": 139, - "start_urls": ["__SHOPPING__"], - "intent": "Open the \"makeup remover\" category page filtered to under $46.99", - "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", - "instantiation_dict": {"price_range": "under $46.99", "product_category": "makeup remover"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html", - "query_params": { "price": ["0-46.99"] } - } - } - ], - 
"revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 272, - "intent_template_id": 139, - "start_urls": ["__SHOPPING__"], - "intent": "Open the \"children dental care\" category page filtered to under $78", - "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", - "instantiation_dict": {"price_range": "under $78", "product_category": "children dental care"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html", - "query_params": { "price": ["0-78"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 273, - "intent_template_id": 139, - "start_urls": ["__SHOPPING__"], - "intent": "Open the \"furniture with accent\" category page filtered to under $199", - "intent_template": "Open the \"{{product_category}}\" category page filtered to {{price_range}}", - "instantiation_dict": {"price_range": "under $199", "product_category": "furniture with accent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html", - "query_params": { "price": ["0-199"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 274, - "intent_template_id": 212, - "start_urls": ["__SHOPPING__"], - "intent": "Open the search results for \"usb wifi\"", - "intent_template": "Open the search results for \"{{keyword}}\"", - "instantiation_dict": {"keyword": "usb wifi"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": 
{"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["usb wifi"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 275, - "intent_template_id": 212, - "start_urls": ["__SHOPPING__"], - "intent": "Open the search results for \"xbox\"", - "intent_template": "Open the search results for \"{{keyword}}\"", - "instantiation_dict": {"keyword": "xbox"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["xbox"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 276, - "intent_template_id": 212, - "start_urls": ["__SHOPPING__"], - "intent": "Open the search results for \"switch accessories\"", - "intent_template": "Open the search results for \"{{keyword}}\"", - "instantiation_dict": {"keyword": "switch accessories"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["switch accessories"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 277, - "intent_template_id": 212, - "start_urls": ["__SHOPPING__"], - "intent": "Open the search results for \"batteries for iphone 13\"", - "intent_template": "Open the search results for \"{{keyword}}\"", - "instantiation_dict": {"keyword": "batteries for iphone 13"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": 
{"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["batteries for iphone 13"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 278, - "intent_template_id": 212, - "start_urls": ["__SHOPPING__"], - "intent": "Open the search results for \"green tea bag for weight loss\"", - "intent_template": "Open the search results for \"{{keyword}}\"", - "instantiation_dict": {"keyword": "green tea bag for weight loss"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["green tea bag for weight loss"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 279, - "intent_template_id": 204, - "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of Bluetooth headphones from Sony, and also share the price range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Bluetooth headphones from Sony", - "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "number", "format": "currency"}, - "max": {"type": "number", "format": "currency"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "names": [ - "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)", - "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones", - "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus", - "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black", - "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)", - "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)", - "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)", - "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)", - "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)", - "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)", - "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black", - "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed" - ], - "min": 18.99, - "max": 406 - } - ] - } - } - ], - "revision": 2 - }, - { - 
"sites": ["shopping"], - "task_id": 280, - "intent_template_id": 204, - "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "chargers from Anker", - "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "number", "format": "currency"}, - "max": {"type": "number", "format": "currency"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "names": [ - "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More", - "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)", - "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)", - "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply", - "Anker 10W Max Wireless 
Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)", - "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)", - "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More", - "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy", - "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More", - "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)", - "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More", - "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)" - ], - "min": 8.99, - "max": 59.99 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 281, - "intent_template_id": 204, - "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of Oral B brush heads designed for children, and also share the price range for the available models. 
Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "Oral B brush heads designed for children", - "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "number", "format": "currency"}, - "max": {"type": "number", "format": "currency"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "names": [ - "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count", - "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack" - ], - "min": 12.99, - "max": 14.98 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 282, - "intent_template_id": 204, - "start_urls": ["__SHOPPING__"], - "intent": "Provide me with the full names of slide slippers from Nike, and also share the price range for the available models. Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details.", - "intent_template": "Provide me with the full names of {{product}}, and also share the price range for the available models. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "product": "slide slippers from Nike", - "retrieved_data_format_spec": "Return an object with keys \"names\" (list of product names), \"min\" and \"max\" (as numbers, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "names": { "type": "array", "items": {"type": "string"} }, - "min": {"type": "number", "format": "currency"}, - "max": {"type": "number", "format": "currency"} - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "names": [ - "Nike Men's Air Max Camden Slide Sandal", - "Nike Men's Benassi JDI Fanny Pack Slides", - "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)", - "Nike Offcourt Slide Mens Bq4639-002 Size 12", - "Nike Jordan Men's Break Slide Red AR6374-602", - "Nike Victori One Slide Mens Style : Dd9559-300", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)", - "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)", - "Nike womens Benassi Just Do It" - ], - "min": 27.6, - "max": 90.65 - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 283, - "intent_template_id": 210, - "start_urls": ["__SHOPPING__"], - "intent": "Open the page showing the most recent Xbox controller models released between 2020-2021", - "intent_template": "Open the page showing the most recent Xbox controller models released between 2020-2021", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": 
"__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 284, - "intent_template_id": 207, - "start_urls": ["__SHOPPING__"], - "intent": "View the product page for the least expensive shoe storage with a minimum storage capacity of 12 pairs.", - "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": {"product": "shoe storage", "min_storage": "12 pairs"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 285, - "intent_template_id": 207, - "start_urls": ["__SHOPPING__"], - "intent": "View the product page for the least expensive switch card holder with a minimum storage capacity of 15 cards.", - "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": {"product": "switch card holder", "min_storage": "15 cards"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html" - } - } - ], - "revision": 2 - }, - { - "sites": 
["shopping"], - "task_id": 286, - "intent_template_id": 207, - "start_urls": ["__SHOPPING__"], - "intent": "View the product page for the least expensive ssd hard drive with a minimum storage capacity of 1TB.", - "intent_template": "View the product page for the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.", - "instantiation_dict": {"product": "ssd hard drive", "min_storage": "1TB"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/portable-ssd-2tb-external-hard-drive-mobile-solid-state-drive-portable-hard-drive-for-pc-laptop-and-mac-data-storage-and-transfer-2tb-silver.html" - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 287, - "intent_template_id": 47, - "start_urls": ["__MAP__"], - "intent": "How much time does it take from Pittsburgh to Philadelphia by car? Return the value as a string in HH:MM:SS format only, without any additional details.", - "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car? 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "retrieved_data_format_spec": "Return the value as a string in HH:MM:SS format only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "duration"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["5h 47min"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 288, - "intent_template_id": 234, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the email of the customer who has the most cancellations in the history", - "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", - "instantiation_dict": {"attribute": "email", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["coolcat321@hotmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 289, - "intent_template_id": 234, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the email address, name, phone number of the customer who has the most cancellations in the history. Return a list of objects with keys \"name\", \"email\", and \"phone_number\".", - "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "email address, name, phone number", - "retrieved_data_format_spec": ". Return a list of objects with keys \"name\", \"email\", and \"phone_number\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "name" : { "type": "string" }, - "email" : { "type": "string" }, - "phone_number": { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "name": "Samantha Jones", - "email": "coolcat321@hotmail.com", - "phone_number": "3055551212" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 290, - "intent_template_id": 234, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history", - "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "product SKUs in the most recent cancelled orders", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["WSH09-29-White", "WSH09-28-Green", "MSH11-34-Blue", "WP09-29-Purple"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 291, - "intent_template_id": 234, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total spend on products in the most recent cancelled orders excluding shipping and handling of the customer who has the most cancellations in the historyReturn the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", - "instantiation_dict": { - "attribute": "total spend on products in the most recent cancelled orders excluding shipping and 
handling", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [148.8] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 292, - "intent_template_id": 234, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Get the total number of cancellations of the customer who has the most cancellations in the history", - "intent_template": "Get the {{attribute}} of the customer who has the most cancellations in the history{{retrieved_data_format_spec}}", - "instantiation_dict": {"attribute": "total number of cancellations", "retrieved_data_format_spec": ""}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [9] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 293, - "intent_template_id": 329, - "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone Super_Awesome_Robot with SSH. Return the URL only, without any additional details.", - "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "repo": "Super_Awesome_Robot", - "retrieved_data_format_spec": "Return the URL only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["git@__SSH_HOST__:convexegg/super_awesome_robot.git"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 294, - "intent_template_id": 329, - "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone ChatGPT with SSH. Return the URL only, without any additional details.", - "intent_template": "Get the URL to clone {{repo}} with SSH. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "repo": "ChatGPT", - "retrieved_data_format_spec": "Return the URL only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["git@__SSH_HOST__:convexegg/chatgpt.git"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 295, - "intent_template_id": 329, - "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone metaseq with SSH. Return the URL only, without any additional details.", - "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "repo": "metaseq", - "retrieved_data_format_spec": "Return the URL only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["git@__SSH_HOST__:root/metaseq.git"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 296, - "intent_template_id": 329, - "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone the best GAN python implementation with SSH. Return the URL only, without any additional details.", - "intent_template": "Get the URL to clone {{repo}} with SSH. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "repo": "the best GAN python implementation", - "retrieved_data_format_spec": "Return the URL only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["git@__SSH_HOST__:eriklindernoren/PyTorch-GAN.git"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 297, - "intent_template_id": 329, - "start_urls": ["__GITLAB__"], - "intent": "Get the URL to clone the most stared Covid related project with SSH. Return the URL only, without any additional details.", - "intent_template": "Get the URL to clone {{repo}} with SSH. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "repo": "the most stared Covid related project", - "retrieved_data_format_spec": "Return the URL only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "url"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["git@__SSH_HOST__:covid19india/covid19india-react.git"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 298, - "intent_template_id": 180, - "start_urls": ["__SHOPPING__"], - "intent": "Open the order details page for the most recent completed order", - "intent_template": "Open the order details page for the most recent {{status}} order", - "instantiation_dict": {"status": "completed"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/view/order_id/180/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 299, - "intent_template_id": 180, - "start_urls": ["__SHOPPING__"], - "intent": "Open the order details page for the most recent cancelled order", - "intent_template": "Open the order details page for the most recent {{status}} order", - "instantiation_dict": {"status": "cancelled"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/view/order_id/170/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 300, - "intent_template_id": 180, - "start_urls": ["__SHOPPING__"], - "intent": "Open the order details page for the most 
recent pending order", - "intent_template": "Open the order details page for the most recent {{status}} order", - "instantiation_dict": {"status": "pending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/view/order_id/189/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 301, - "intent_template_id": 180, - "start_urls": ["__SHOPPING__"], - "intent": "Open the order details page for the most recent processing order", - "intent_template": "Open the order details page for the most recent {{status}} order", - "instantiation_dict": {"status": "processing"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 302, - "intent_template_id": 180, - "start_urls": ["__SHOPPING__"], - "intent": "Open the order details page for the most recent out of delivery order", - "intent_template": "Open the order details page for the most recent {{status}} order", - "instantiation_dict": {"status": "out of delivery"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 303, - "intent_template_id": 321, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Kilian make during 2023 
in the current repository?", - "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", - "instantiation_dict": {"user": "Kilian", "period": "during 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [1] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 304, - "intent_template_id": 321, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Eric Bailey make between start of Feb 2023 and end of May 2023 in the current repository?", - "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", - "instantiation_dict": { - "user": "Eric Bailey", - "period": "between start of Feb 2023 and end of May 2023" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [14] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 305, - "intent_template_id": 321, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Philip make in Jan 2023 in the current repository?", - "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", - "instantiation_dict": {"user": "Philip", "period": "in Jan 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 306, - "intent_template_id": 321, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Anthony make between August 2022 through 
September 2022 in the current repository?", - "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", - "instantiation_dict": {"user": "Anthony", "period": "between August 2022 through September 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 307, - "intent_template_id": 321, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "How many commits did Nic Chan make on April 7th 2022 in the current repository?", - "intent_template": "How many commits did {{user}} make {{period}} in the current repository?", - "instantiation_dict": {"user": "Nic Chan", "period": "on April 7th 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [5] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 308, - "intent_template_id": 323, - "start_urls": ["__GITLAB__"], - "intent": "Get the username(s) of the user(s) with the most commits to the primer/design project", - "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", - "instantiation_dict": {"repo": "primer/design"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["shawn.allen@github.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 309, - "intent_template_id": 323, - "start_urls": ["__GITLAB__"], - "intent": "Get the username(s) of the user(s) with the most commits to the thoughtbot/administrate project", - 
"intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", - "instantiation_dict": {"repo": "thoughtbot/administrate"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["wright.grayson@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 310, - "intent_template_id": 323, - "start_urls": ["__GITLAB__"], - "intent": "Get the username(s) of the user(s) with the most commits to the AndroidSlidingUpPanel project", - "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", - "instantiation_dict": {"repo": "AndroidSlidingUpPanel"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["antonlopyrev@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 311, - "intent_template_id": 323, - "start_urls": ["__GITLAB__"], - "intent": "Get the username(s) of the user(s) with the most commits to the Pytorch GAN project", - "intent_template": "Get the username(s) of the user(s) with the most commits to the {{repo}} project", - "instantiation_dict": {"repo": "Pytorch GAN"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["eriklindernoren@live.se"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 312, - "intent_template_id": 323, - "start_urls": ["__GITLAB__"], - "intent": "Get the username(s) of the user(s) with the most commits to the csvkit project", - "intent_template": "Get the username(s) of the user(s) with the most 
commits to the {{repo}} project", - "instantiation_dict": {"repo": "csvkit"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["staringmonkey@gmail.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 313, - "intent_template_id": 134, - "start_urls": ["__SHOPPING__"], - "intent": "Get the customer service phone number", - "intent_template": "Get the customer service phone number", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 314, - "intent_template_id": 324, - "start_urls": ["__GITLAB__"], - "intent": "Get the full names of the top 3 contributors (by commit count) to primer/design repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", - "instantiation_dict": { - "repo": "primer/design", - "attribute": "full names", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Shawn Allen", "Inayaili Le\u00f3n", "Aurora Pleguezuelo"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 315, - "intent_template_id": 324, - "start_urls": ["__GITLAB__"], - "intent": "Get the email addresses of the top 3 contributors (by commit count) to Pytorch GAN repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", - "instantiation_dict": { - "repo": "Pytorch GAN", - 
"attribute": "email addresses", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["eriklindernoren@live.se", "eriklindernoren@gmail.com", "pinnacle.chen@qq.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 316, - "intent_template_id": 324, - "start_urls": ["__GITLAB__"], - "intent": "Get the email addresses of the top 3 contributors (by commit count) to facebook\"s guide on building react apps repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", - "instantiation_dict": { - "repo": "facebook\"s guide on building react apps", - "attribute": "email addresses", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["dan.abramov@gmail.com", "timer150@gmail.com", "ian@iansutherland.ca"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 317, - "intent_template_id": 324, - "start_urls": ["__GITLAB__"], - "intent": "Get the names and number of commits of the top 3 contributors (by commit count) to metaseq repo. Return a list of objects with keys \"first_name\", \"last_name\", and \"number_of_commits\".", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", - "instantiation_dict": { - "repo": "metaseq", - "attribute": "names and number of commits", - "retrieved_data_format_spec": ". Return a list of objects with keys \"first_name\", \"last_name\", and \"number_of_commits\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "first_name" : { "type": "string" }, - "last_name" : { "type": "string" }, - "number_of_commits": { "type": "number" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "first_name": "Susan" , "last_name": "Zhang" , "number_of_commits": 70 }, - { "first_name": "Stephen", "last_name": "Roller", "number_of_commits": 51 }, - { "first_name": "Peter" , "last_name": "Albert", "number_of_commits": 12 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 318, - "intent_template_id": 324, - "start_urls": ["__GITLAB__"], - "intent": "Get the last names of the top 3 contributors (by commit count) to 2019-nCov repo", - "intent_template": "Get the {{attribute}} of the top 3 contributors (by commit count) to {{repo}} repo{{retrieved_data_format_spec}}", - "instantiation_dict": { - "repo": "2019-nCov", - "attribute": "last names", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Lo", "Chen", "Chu"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 319, - "intent_template_id": 160, - "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in April 2022, including shipping fee.. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "time": "April 2022", - "conditions": ", including shipping fee.", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 320, - "intent_template_id": 160, - "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in Feb 2023, including shipping fee.. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "time": "Feb 2023", - "conditions": ", including shipping fee.", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [406.53] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 321, - "intent_template_id": 160, - "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in 2022, including shipping fee.. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "time": "2022", - "conditions": ", including shipping fee.", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [3053.97] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 322, - "intent_template_id": 160, - "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in May 2023 if I cannot get the shipping fee refunded?. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "time": "May 2023", - "conditions": " if I cannot get the shipping fee refunded?", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [350.42] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 323, - "intent_template_id": 160, - "start_urls": ["__SHOPPING__"], - "intent": "How much refund should I expect from my orders canceled, if any, in March 2022? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back.. Return the value as a number (e.g., 10.99) only, without any additional details.", - "intent_template": "How much refund should I expect from my orders canceled, if any, in {{time}}{{conditions}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "time": "March 2022", - "conditions": "? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back.", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [264.49] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 324, - "intent_template_id": 208, - "start_urls": ["__SHOPPING__"], - "intent": "Pull up the page with all \"chairs\" listings sorted by ascending price.", - "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", - "instantiation_dict": {"product": "chairs", "sorting_order": "ascending price"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/", - "query_params": { - "product_list_order": [ "price" ], - "q" : [ "chairs" ], - "product_list_dir" : [ "asc" ] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 325, - "intent_template_id": 208, - "start_urls": ["__SHOPPING__"], - "intent": "Pull up the page with all \"mouth night guard\" listings sorted by descending price.", - "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", - "instantiation_dict": {"product": "mouth night guard", "sorting_order": "descending price"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", 
"retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order).+$"], - "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/", - "query_params": { "q": ["mouth night guard "], "product_list_order": ["price"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 326, - "intent_template_id": 208, - "start_urls": ["__SHOPPING__"], - "intent": "Pull up the page with all \"Canon photo printer\" listings sorted by search relevance, from most to least.", - "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", - "instantiation_dict": { - "product": "Canon photo printer", - "sorting_order": "search relevance, from most to least" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/catalogsearch/result/", - "query_params": { "q": ["Canon photo printer"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 327, - "intent_template_id": 208, - "start_urls": ["__SHOPPING__"], - "intent": "Pull up the page with all \"iphone 12 phone case\" listings sorted by name alphabetically.", - "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", - "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "name alphabetically"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!q$|product_list_dir$|product_list_order$).+$"], - "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/", - 
"query_params": { - "q" : [ " iphone 12 phone case" ], - "product_list_order": [ "name" ], - "product_list_dir" : [ "asc" ] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 328, - "intent_template_id": 208, - "start_urls": ["__SHOPPING__"], - "intent": "Pull up the page with all \"iphone 12 phone case\" listings sorted by price.", - "intent_template": "Pull up the page with all \"{{product}}\" listings sorted by {{sorting_order}}.", - "instantiation_dict": {"product": "iphone 12 phone case", "sorting_order": "price"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!q$|product_list_order$).+$"], - "expected": { - "url": "__SHOPPING__/catalogsearch/result/index/", - "query_params": { "product_list_order": ["price"], "q": [" iphone 12 phone case"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 329, - "intent_template_id": 147, - "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market on April 19, 2023, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "time": "on April 19, 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 330, - "intent_template_id": 147, - "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in March 2023, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "time": "in March 2023", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [53.31] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 331, - "intent_template_id": 147, - "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in July 2022, excluding shipping. Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. 
{{retrieved_data_format_spec}}", - "instantiation_dict": { - "time": "in July 2022", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [25.16] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 332, - "intent_template_id": 147, - "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market each month from Jan to the March 31, 2023, excluding shipping. Return a list of objects with keys \"month\" (month name) and \"total\" (as a number, e.g., 10.99) only, without any additional details", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "time": "each month from Jan to the March 31, 2023", - "retrieved_data_format_spec": "Return a list of objects with keys \"month\" (month name) and \"total\" (as a number, e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "month": { "type": "string", "format": "month" }, - "total": { "type": "number", "format": "currency" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "month": "Jan", "total": 542.88 }, - { "month": "Feb", "total": 912.50 }, - { "month": "Mar", "total": 53.31 } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 333, - "intent_template_id": 147, - "start_urls": ["__SHOPPING__"], - "intent": "Return the total amount I spent on shopping at One Stop Market in November 2022, excluding shipping. 
Return the value as a number (e.g., 10.99) only, without any additional details", - "intent_template": "Return the total amount I spent on shopping at One Stop Market {{time}}, excluding shipping. {{retrieved_data_format_spec}}", - "instantiation_dict": { - "time": "in November 2022", - "retrieved_data_format_spec": "Return the value as a number (e.g., 10.99) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number", "format": "currency"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [358.18] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 334, - "intent_template_id": 169, - "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my muffin cornbread mix. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Return the date I last ordered my {{description}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "description": "muffin cornbread mix", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["03/11/2023"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 335, - "intent_template_id": 169, - "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my body butter. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Return the date I last ordered my {{description}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "description": "body butter", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["01/16/2023"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 336, - "intent_template_id": 169, - "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my conditioner. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Return the date I last ordered my {{description}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "description": "conditioner", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["01/16/2023"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 337, - "intent_template_id": 169, - "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my olive bread. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Return the date I last ordered my {{description}}. 
{{retrieved_data_format_spec}}.", - "instantiation_dict": { - "description": "olive bread", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["12/12/2022"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 338, - "intent_template_id": 169, - "start_urls": ["__SHOPPING__"], - "intent": "Return the date I last ordered my toothpaste. Return the date in YYYY-MM-DD format or null if not available, without any additional details.", - "intent_template": "Return the date I last ordered my {{description}}. {{retrieved_data_format_spec}}.", - "instantiation_dict": { - "description": "toothpaste", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["12/04/2022"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 339, - "intent_template_id": 299, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Go to the list of all opened issues that report bugs for the current project", - "intent_template": "Go to the list of all opened issues {{description}} for the current project", - "instantiation_dict": {"description": "that report bugs"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": 
"^__GITLAB__/a11yproject/a11yproject.com/-/issues/.*$"}, - "ignored_query_params_patterns": [".*"] - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort", "scope"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?state=opened&label_name%5B%5D=bug" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 340, - "intent_template_id": 299, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Go to the list of all opened issues that report bugs for the current project", - "intent_template": "Go to the list of all opened issues {{description}} for the current project", - "instantiation_dict": {"description": "that report bugs"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__GITLAB__/primer/design/-/issues/.*$"}, - "ignored_query_params_patterns": [".*"] - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort", "scope"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/primer/design/-/issues/?state=opened&label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 341, - "intent_template_id": 299, - "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Go to the list of all opened issues requesting new features for the current project", - "intent_template": "Go to the list of all opened issues {{description}} for the current project", - "instantiation_dict": {"description": "requesting new features"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": 
{"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "^__GITLAB__/root/metaseq/-/issues/.*$"}, - "ignored_query_params_patterns": [".*"] - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort", "scope"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=enhancement" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 342, - "intent_template_id": 299, - "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Go to the list of all opened issues that ask about OPT model related questions for the current project", - "intent_template": "Go to the list of all opened issues {{description}} for the current project", - "instantiation_dict": {"description": "that ask about OPT model related questions"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/root/metaseq/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=question&search=OPT%20Model" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 343, - "intent_template_id": 299, - "start_urls": ["__GITLAB__/root/metaseq"], - "intent": "Go to the list of all opened issues that don't have any labels for the current project", - "intent_template": "Go to the list of all opened issues {{description}} for the current project", - "instantiation_dict": 
{"description": "that don't have any labels"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^__GITLAB__/root/metaseq/-/issues.*$"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["page", "sort", "scope"], - "expected": { - "url": "__GITLAB__/api/graphql", - "http_method": "POST", - "headers": { - "referer": "__GITLAB__/root/metaseq/-/issues/?state=opened&label_name%5B%5D=None" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 344, - "intent_template_id": 248, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "How many reviews did our shop receive so far?", - "intent_template": "How many reviews did our shop receive {{time}}?", - "instantiation_dict": {"time": "so far"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 345, - "intent_template_id": 248, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "How many reviews did our shop receive in Apr 2023?", - "intent_template": "How many reviews did our shop receive {{time}}?", - "instantiation_dict": {"time": "in Apr 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 346, - "intent_template_id": 248, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "How many reviews did our shop receive during 2022?", - 
"intent_template": "How many reviews did our shop receive {{time}}?", - "instantiation_dict": {"time": "during 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 347, - "intent_template_id": 248, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "How many reviews did our shop receive from the beginning of the shop?", - "intent_template": "How many reviews did our shop receive {{time}}?", - "instantiation_dict": {"time": "from the beginning of the shop"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [351] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 348, - "intent_template_id": 248, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "How many reviews did our shop receive in May 2023?", - "intent_template": "How many reviews did our shop receive {{time}}?", - "instantiation_dict": {"time": "in May 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 349, - "intent_template_id": 298, - "start_urls": ["__GITLAB__"], - "intent": "Get the usernames of other users who have access to my repo gimmiethat.space", - "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", - "instantiation_dict": {"repo": "gimmiethat.space"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - 
"task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^@?yjlou$"] - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab"], - "task_id": 350, - "intent_template_id": 298, - "start_urls": ["__GITLAB__"], - "intent": "Get the usernames of other users who have access to my repo prism-theme", - "intent_template": "Get the usernames of other users who have access to my repo {{repo}}", - "instantiation_dict": {"repo": "prism-theme"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^@?abisubramanya27$"] - } - } - ], - "revision": 3 - }, - { - "sites": ["shopping"], - "task_id": 351, - "intent_template_id": 137, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the page showing PS4 accessories products sorted by ascending price", - "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", - "instantiation_dict": {"product_category": "PS4 accessories", "order": "ascending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/video-games/playstation-4/accessories.html", - "query_params": { "product_list_order": ["price"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 352, - "intent_template_id": 137, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the page showing nutrition bars and drinks products sorted by ascending price", - "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", - "instantiation_dict": {"product_category": "nutrition bars and drinks", "order": "ascending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": 
{"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html", - "query_params": { "product_list_order": ["price"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 353, - "intent_template_id": 137, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the page showing competitive swimwear products sorted by ascending price", - "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", - "instantiation_dict": {"product_category": "competitive swimwear", "order": "ascending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html", - "query_params": { "product_list_order": ["price"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 354, - "intent_template_id": 137, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the page showing living room furniture products sorted by descending price", - "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", - "instantiation_dict": {"product_category": "living room furniture", "order": "descending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html", - "query_params": { "product_list_order": ["price"], "product_list_dir": ["desc"] } - } - } - ], - 
"revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 355, - "intent_template_id": 137, - "start_urls": ["__SHOPPING__"], - "intent": "Go to the page showing kids\" bedding products sorted by descending price", - "intent_template": "Go to the page showing {{product_category}} products sorted by {{order}} price", - "instantiation_dict": {"product_category": "kids\" bedding", "order": "descending"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html", - "query_params": { "product_list_dir": ["desc"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 356, - "intent_template_id": 49, - "start_urls": ["__MAP__"], - "intent": "Show on the map the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed. (Use the OSRM direction service.)", - "intent_template": "Show on the map the route from Gates and Hillman Centers at CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed. 
(Use the OSRM direction service.)", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 357, - "intent_template_id": 291, - "start_urls": ["__GITLAB__"], - "intent": "Go to the merge requests requiring my review", - "intent_template": "Go to the merge requests requiring my review", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/dashboard/merge_requests", - "query_params": { - "reviewer_username": [ "byteblaze" ], - "scope" : [ "^(all|)$" ], - "state" : [ "^(opened|)$" ] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 358, - "intent_template_id": 206, - "start_urls": ["__SHOPPING__"], - "intent": "Get the shipping method for order number 187.", - "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "info": "shipping method", - "order_number": 187, - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["^flat rate[^a-z0-9]*(?:fixed)$"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 359, - "intent_template_id": 206, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order date for order number 148.Return the date in YYYY-MM-DD format or null if not available, without any additional details", - 
"intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "info": "order date", - "order_number": "148", - "retrieved_data_format_spec": "Return the date in YYYY-MM-DD format or null if not available, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "date"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["January 29, 2023"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 360, - "intent_template_id": 206, - "start_urls": ["__SHOPPING__"], - "intent": "Get the product names for order number 148.", - "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "info": "product names", - "order_number": "148", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)", - "Russound 5B45W 4\" Indoor Outdoor Speakers White" - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 361, - "intent_template_id": 206, - "start_urls": ["__SHOPPING__"], - "intent": "Get the order statuses for order number 170 and 189. Return a list of objects with keys \"order_number\" and \"status\".", - "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "info": "order statuses", - "order_number": "170 and 189", - "retrieved_data_format_spec": " Return a list of objects with keys \"order_number\" and \"status\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { "order_number": {"type": "string"}, "status": {"type": "string"} } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { "order_number": "^#?\\s*0*170$", "status": "canceled" }, - { "order_number": "^#?\\s*0*189$", "status": "pending" } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 362, - "intent_template_id": 206, - "start_urls": ["__SHOPPING__"], - "intent": "Get the billing address for order number 00178. Return a list of objects with keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\".", - "intent_template": "Get the {{info}} for order number {{order_number}}.{{retrieved_data_format_spec}}", - "instantiation_dict": { - "info": "billing address", - "order_number": "00178", - "retrieved_data_format_spec": " Return a list of objects with keys \"house_number\", \"street\", \"city\", \"state\", \"postcode\", and \"country\"." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "title": "full_address", - "properties": { - "house_number": { "type": "string" }, - "street" : { "type": "string" }, - "city" : { "type": "string" }, - "state" : { "type": "string" }, - "postcode" : { "type": "string" }, - "country" : { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "house_number": "101", - "street": "S San Mateo Dr", - "city": "San Mateo", - "state": "California", - "postcode": "94010", - "country": "United States" - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 363, - "intent_template_id": 58, - "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Mellon University", - "location_address_2": "Carnegie Music Hall", - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["748m"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 364, - "intent_template_id": 58, - "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking. 
Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Mellon University", - "location_address_2": "UPMC Shadyside", - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.7km"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 365, - "intent_template_id": 58, - "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Music Hall", - "location_address_2": "UPMC Shadyside", - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["2.2km"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 366, - "intent_template_id": 58, - "start_urls": ["__MAP__"], - "intent": "Measure distance between CVS (closest one) and UPMC Shadyside by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. (Use the OSRM direction service.)", - "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "CVS (closest one)", - "location_address_2": "UPMC Shadyside", - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.2km"] } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 367, - "intent_template_id": 58, - "start_urls": ["__MAP__"], - "intent": "Measure distance between Carnegie Mellon University and CVS (closest one) by walking. Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details. 
(Use the OSRM direction service.)", - "intent_template": "Measure distance between {{location_address_1}} and {{location_address_2}} by walking. {{retrieved_data_format_spec}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Mellon University", - "location_address_2": "CVS (closest one)", - "retrieved_data_format_spec": "Return the value as a string (numeric value with unit km or m, e.g., 2.4km, 500m) only, without any additional details" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string", "format": "distance"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": ["1.4km"] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 368, - "intent_template_id": 188, - "start_urls": ["__SHOPPING__"], - "intent": "Return the list of discounted (sale) items available on the site.", - "intent_template": "Return the list of discounted (sale) items available on the site.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 369, - "intent_template_id": 52, - "start_urls": ["__MAP__"], - "intent": "Pull up the description page of Carnegie Music Hall on Map", - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": {"location": "Carnegie Music Hall"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__MAP__/way/154257484/"} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 370, - "intent_template_id": 52, - "start_urls": 
["__MAP__"], - "intent": "Pull up the description page of Carnegie Mellon University on Map", - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": {"location": "Carnegie Mellon University"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__MAP__/relation/2279034/"} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 371, - "intent_template_id": 52, - "start_urls": ["__MAP__"], - "intent": "Pull up the description page of Piada restaurant near Pitt on Map", - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": {"location": "Piada restaurant near Pitt"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__MAP__/node/2710170970"} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 372, - "intent_template_id": 52, - "start_urls": ["__MAP__"], - "intent": "Pull up the description page of the Costco in Pittsburgh near a river on Map", - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": {"location": "the Costco in Pittsburgh near a river"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__MAP__/way/168456128"} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 373, - "intent_template_id": 52, - "start_urls": ["__MAP__"], - "intent": "Pull up the description page of Whole Foods near 
Carnegie Mellon on Map", - "intent_template": "Pull up the description page of {{location}} on Map", - "instantiation_dict": {"location": "Whole Foods near Carnegie Mellon"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__MAP__/node/10114377662"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 374, - "intent_template_id": 266, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the Magento Blank theme settings page", - "intent_template": "Go to the {{name}} theme settings page", - "instantiation_dict": {"name": "Magento Blank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 375, - "intent_template_id": 266, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the Magento Luma theme settings page", - "intent_template": "Go to the {{name}} theme settings page", - "instantiation_dict": {"name": "Magento Luma"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 376, - "intent_template_id": 182, - "start_urls": ["__SHOPPING__"], - "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.", - "intent_template": "Summarize customer reviews for 
{{product}}.", - "instantiation_dict": {"product": "Amazon Echo Dot 3rd generation"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 377, - "intent_template_id": 59, - "start_urls": ["__MAP__"], - "intent": "Show on the map restaurants near CMU ArtPark Lab", - "intent_template": "Show on the map {{space}} near {{location}}", - "instantiation_dict": {"location": "CMU ArtPark Lab", "space": "restaurants"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__MAP__/search", - "query_params": { "query": ["restaurants near CMU ArtPark Lab"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 378, - "intent_template_id": 59, - "start_urls": ["__MAP__"], - "intent": "Show on the map parking near Carnegie Mellon University", - "intent_template": "Show on the map {{space}} near {{location}}", - "instantiation_dict": {"location": "Carnegie Mellon University", "space": "parking"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__MAP__/search", - "query_params": { "query": ["parking near Carnegie Mellon University"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 379, - "intent_template_id": 59, - "start_urls": ["__MAP__"], - "intent": "Show on the map hotels near Carnegie Mellon University", - "intent_template": "Show on the map {{space}} near {{location}}", - "instantiation_dict": {"location": "Carnegie Mellon 
University", "space": "hotels"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__MAP__/search", - "query_params": { "query": ["hotels near Carnegie Mellon University"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 380, - "intent_template_id": 59, - "start_urls": ["__MAP__"], - "intent": "Show on the map bars near Carnegie Music Hall", - "intent_template": "Show on the map {{space}} near {{location}}", - "instantiation_dict": {"location": "Carnegie Music Hall", "space": "bars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__MAP__/search", - "query_params": { "query": ["bars near Carnegie Music Hall"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 381, - "intent_template_id": 59, - "start_urls": ["__MAP__"], - "intent": "Show on the map hotels near Carnegie Music Hall", - "intent_template": "Show on the map {{space}} near {{location}}", - "instantiation_dict": {"location": "Carnegie Music Hall", "space": "hotels"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__MAP__/search", - "query_params": { "query": ["hotels near Carnegie Music Hall"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 382, - "intent_template_id": 781, - "start_urls": ["__MAP__"], - "intent": "I am arriving at Carnegie Mellon University. 
Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services. (Use the OSRM direction service.)", - "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services. (Use the OSRM direction service.)", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 383, - "intent_template_id": 782, - "start_urls": ["__MAP__"], - "intent": "I am arriving at Pittsburgh Airport. Find if there is a Hyatt hotel nearby. If there is, get me its name and the names of any supermarkets that are within 15 minutes driving from the hotel. Return a list of objects with keys \"hotel\" and \"supermarkets\". (Use the OSRM direction service.)", - "intent_template": "I am arriving at Pittsburgh Airport. Find if there is a Hyatt hotel nearby. If there is, get me its name and the names of any supermarkets that are within 15 minutes driving from the hotel. {{retrieved_data_format_spec}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "retrieved_data_format_spec": "Return a list of objects with keys \"hotel\" and \"supermarkets\"" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "hotel": {"type": "string", "format": "location-name"}, - "supermarkets": { "type": "array", "items": {"type": "string", "format": "location-name"} } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "hotel": "Hyatt Regency Pittsburgh International Airport", - "supermarkets": ["Giant Eagle", "ALDI"] - } - ] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 384, - "intent_template_id": 666, - "start_urls": ["__SHOPPING__"], - "intent": "List the customer names who complain about the quality of EYZUTAK phone cases", - "intent_template": "List the customer names who {{review_criteria}}", - "instantiation_dict": {"review_criteria": "complain about the quality of EYZUTAK phone cases"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Lisa Lee", "Evelyn Kurver", "Amanda", "N Randall"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 385, - "intent_template_id": 666, - "start_urls": ["__SHOPPING__"], - "intent": "List the customer names who thinks EYZUTAK phone cases are of good looking", - "intent_template": "List the customer names who {{review_criteria}}", - "instantiation_dict": {"review_criteria": "thinks EYZUTAK phone cases are of good looking"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Lisa Lee", "MH", 
"Misba009", "Amanda", "Cally", "Bethany Robertson"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 386, - "intent_template_id": 1355, - "start_urls": ["__SHOPPING__"], - "intent": "What is the rating of Ugreen lightning to 3.5mm cable?", - "intent_template": "What is the rating of {{product}}?", - "instantiation_dict": {"product": "Ugreen lightning to 3.5mm cable"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [65] } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 387, - "intent_template_id": 1356, - "start_urls": ["__SHOPPING__"], - "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK", - "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", - "instantiation_dict": {"stars": "4 or 5 stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["MH", "Misba009", "Amanda", "Amazon Customer", "Cally", "Bethany Robertson"] - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 388, - "intent_template_id": 1356, - "start_urls": ["__SHOPPING__"], - "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK", - "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK", - "instantiation_dict": {"stars": "1 or 2 stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["Evelyn Kurver", "N Randall"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 389, - "intent_template_id": 348, - "start_urls": ["__GITLAB__"], - "intent": "Post \"Thanks, working on reviews\" 
for the merge request related to octovisuals page in primer/design project", - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": { - "mr": "octovisuals page", - "repo": "primer/design", - "content": "Thanks, working on reviews" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/notes", - "http_method": "POST", - "post_data": { - "$.note.noteable_type": "MergeRequest", - "$.note.note": "Thanks, working on reviews" - }, - "query_params": { "target_id": ["139245"], "target_type": ["merge_request"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 390, - "intent_template_id": 348, - "start_urls": ["__GITLAB__"], - "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project", - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": { - "mr": "semantic HTML post", - "repo": "a11yproject/a11yproject.com", - "content": "lgtm" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/notes", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "lgtm"}, - "query_params": { "target_id": ["72594"], "target_type": ["merge_request"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 391, - "intent_template_id": 348, - "start_urls": ["__GITLAB__"], - "intent": "Post \"close because non reproducible\" for the merge request 
related to focus edge cases in a11yproject/a11yproject.com project", - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": { - "mr": "focus edge cases", - "repo": "a11yproject/a11yproject.com", - "content": "close because non reproducible" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/notes", - "http_method": "POST", - "post_data": { - "$.note.noteable_type": "MergeRequest", - "$.note.note": "close because non reproducible" - }, - "query_params": { "target_id": ["72404"], "target_type": ["merge_request"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 392, - "intent_template_id": 348, - "start_urls": ["__GITLAB__"], - "intent": "Post \"Good idea\" for the merge request related to color ulitity in a11yproject.com project", - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": {"mr": "color ulitity", "repo": "a11yproject.com", "content": "Good idea"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/notes", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Good idea"}, - "query_params": { "target_id": ["72135"], "target_type": ["merge_request"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 393, - "intent_template_id": 348, - "start_urls": ["__GITLAB__"], - "intent": "Post \"lgtm\" for the merge request related to fixing the broken 
links in byteblaze/empathy-prompts project", - "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project", - "instantiation_dict": { - "mr": "fixing the broken links", - "repo": "byteblaze/empathy-prompts", - "content": "lgtm" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/empathy-prompts/notes", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "lgtm"}, - "query_params": { "target_id": ["138843"], "target_type": ["merge_request"] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 394, - "intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork 2019-nCov.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "2019-nCov"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/79/fork", - "http_method": "POST", - "post_data": {"id": "79", "name": "2019-nCov", "namespace_id": 2505, "path": "2019-nCov"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 395, - "intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork the Pytorch GAN repo with most stars.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "the Pytorch GAN repo with most stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": 
{ - "url": "__GITLAB__/api/v4/projects/86/fork", - "http_method": "POST", - "post_data": {"id": "86", "name": "PyTorch-GAN", "namespace_id": 2505, "path": "PyTorch-GAN"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 396, - "intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork ChatGPT.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "ChatGPT"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/175/fork", - "http_method": "POST", - "post_data": {"id": "175", "name": "Chatgpt", "namespace_id": 2505, "path": "chatgpt"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 397, - "intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork MetaSeq.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "MetaSeq"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/33/fork", - "http_method": "POST", - "post_data": {"id": "33", "name": "metaseq", "namespace_id": 2505, "path": "metaseq"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 398, - "intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork all repos from Akilesh Kannan.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "all repos from Akilesh Kannan"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", 
"status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/170/fork", - "http_method": "POST", - "post_data": {"id": "170", "name": "CacheEval", "namespace_id": 2505, "path": "CacheEval"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/165/fork", - "http_method": "POST", - "post_data": { - "id": "165", - "name": "nvidia-patch", - "namespace_id": 2505, - "path": "nvidia-patch" - }, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/163/fork", - "http_method": "POST", - "post_data": {"id": "163", "name": "SimCache", "namespace_id": 2505, "path": "SimCache"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/164/fork", - "http_method": "POST", - "post_data": { - "id": "164", - "name": "viewgrades-scraper", - "namespace_id": 2505, - "path": "viewgrades-scraper" - }, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/159/fork", - "http_method": "POST", - "post_data": {"id": "159", "name": "dots", "namespace_id": 2505, "path": "dots"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 399, - "intent_template_id": 6, - "start_urls": ["__REDDIT__"], - "intent": "Change my bio to \"I am a robot\" in the discussion forum", - "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", - "instantiation_dict": {"content": "I am a robot"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - 
"url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", - "http_method": "POST", - "post_data": {"user_biography[biography]": "I am a robot"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 400, - "intent_template_id": 6, - "start_urls": ["__REDDIT__"], - "intent": "Change my bio to \"Pro Python Developer with 20 years of Experience\" in the discussion forum", - "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", - "instantiation_dict": {"content": "Pro Python Developer with 20 years of Experience"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", - "http_method": "POST", - "post_data": { - "user_biography[biography]": "Pro Python Developer with 20 years of Experience" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 401, - "intent_template_id": 6, - "start_urls": ["__REDDIT__"], - "intent": "Change my bio to \"Seeking SDE positions\" in the discussion forum", - "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", - "instantiation_dict": {"content": "Seeking SDE positions"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", - "http_method": "POST", - "post_data": {"user_biography[biography]": "Seeking SDE positions"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 402, - "intent_template_id": 6, 
- "start_urls": ["__REDDIT__"], - "intent": "Change my bio to \"Freelance Web Developer\" in the discussion forum", - "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", - "instantiation_dict": {"content": "Freelance Web Developer"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", - "http_method": "POST", - "post_data": {"user_biography[biography]": "Freelance Web Developer"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 403, - "intent_template_id": 6, - "start_urls": ["__REDDIT__"], - "intent": "Change my bio to \"Awesome Prompt Artist\" in the discussion forum", - "intent_template": "Change my bio to \"{{content}}\" in the discussion forum", - "instantiation_dict": {"content": "Awesome Prompt Artist"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - "url": "__REDDIT__/user/MarvelsGrantMan136/edit_biography", - "http_method": "POST", - "post_data": {"user_biography[biography]": "Awesome Prompt Artist"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 404, - "intent_template_id": 22, - "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in books forum", - "intent_template": "Upvote the newest post in {{forum}} forum", - "instantiation_dict": {"forum": "books"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/124260.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 405, - "intent_template_id": 22, - "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in DIY forum", - "intent_template": "Upvote the newest post in {{forum}} forum", - "instantiation_dict": {"forum": "DIY"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/119019.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 406, - "intent_template_id": 22, - "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in future technology forum", - "intent_template": "Upvote the newest post in {{forum}} forum", - "instantiation_dict": {"forum": "future technology"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/119517.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 407, - "intent_template_id": 22, - "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in the deep learning forum", - "intent_template": "Upvote the newest post in {{forum}} forum", - "instantiation_dict": {"forum": "the deep learning"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - 
"evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/125036.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 408, - "intent_template_id": 22, - "start_urls": ["__REDDIT__"], - "intent": "Upvote the newest post in explain like im 5 forum", - "intent_template": "Upvote the newest post in {{forum}} forum", - "instantiation_dict": {"forum": "explain like im 5"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/125342.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 409, - "intent_template_id": 23, - "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], - "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"", - "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", - "instantiation_dict": { - "position_description": "the manager of the website", - "location": "in this post", - "comment_prefix": "", - "content_description": "thanks! I am a big fan of your website." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/59421/-/comment/1235250", - "http_method": "POST", - "post_data": {"reply_to_comment_1235250[comment]": "thanks! 
I am a big fan of your website."}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 410, - "intent_template_id": 23, - "start_urls": ["__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself"], - "intent": "Reply to the first reply in this post with \"don't panic\"", - "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", - "instantiation_dict": { - "position_description": "the first reply", - "location": "in this post", - "comment_prefix": "", - "content_description": "don't panic" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/singularity/69404/-/comment/1042264", - "http_method": "POST", - "post_data": {"reply_to_comment_1042264[comment]": "don't panic"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 411, - "intent_template_id": 355, - "start_urls": ["__GITLAB__"], - "intent": "Change the LICENSE for repo byteblaze/cloud-to-butt to an MIT license", - "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", - "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "license": "an MIT license"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - "url": [ - "__GITLAB__/byteblaze/cloud-to-butt/-/update/master/LICENSE.txt", - 
"__GITLAB__/byteblaze/cloud-to-butt/-/create/master", - "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "master", - "content": "^.*MIT.*License.*$", - "_method": "^.*(?!delete$).*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 412, - "intent_template_id": 355, - "start_urls": ["__GITLAB__"], - "intent": "Change the LICENSE for repo byteblaze/accessible-html-content-patterns to an Apache License", - "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", - "instantiation_dict": { - "repo": "byteblaze/accessible-html-content-patterns", - "license": "an Apache License" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - "url": [ - "__GITLAB__/byteblaze/accessible-html-content-patterns/-/update/main/LICENSE", - "__GITLAB__/byteblaze/accessible-html-content-patterns/-/create/main", - "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "main", - "content": "^.*Apache.*License.*$", - "_method": "^.*(?!delete$).*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 413, - "intent_template_id": 355, - "start_urls": ["__GITLAB__"], - "intent": "Change the LICENSE for repo byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license", - 
"intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", - "instantiation_dict": { - "repo": "byteblaze/a11y-syntax-highlighting", - "license": "one that mandates all copies and derivative works to be under the same license" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - "url": [ - "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/update/main/LICENSE", - "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/create/main", - "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "main", - "content": "^.*GNU GENERAL PUBLIC LICENSE.*$", - "_method": "^.*(?!delete$).*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 414, - "intent_template_id": 355, - "start_urls": ["__GITLAB__"], - "intent": "Change the LICENSE for repo byteblaze/dotfiles to an MIT license", - "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", - "instantiation_dict": {"repo": "byteblaze/dotfiles", "license": "an MIT license"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - 
"url": [ - "__GITLAB__/byteblaze/dotfiles/-/update/main/LICENSE", - "__GITLAB__/byteblaze/dotfiles/-/create/main", - "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "main", - "content": "^.*MIT.*License.*$", - "_method": "^.*(?!delete$).*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 415, - "intent_template_id": 360, - "start_urls": ["__GITLAB__"], - "intent": "Reply on the merge request assigned to me for verification functions: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "instantiation_dict": {"topic": "verification functions"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/a11y-webring.club/notes?target_id=138783&target_type=merge_request", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "^.*@davepgreene.*$"} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 416, - "intent_template_id": 360, - "start_urls": ["__GITLAB__"], - "intent": "Reply on the merge request assigned to me for wcag: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "instantiation_dict": {"topic": "wcag"}, - 
"eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/notes?target_id=72409&target_type=merge_request", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "Thank you"} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 417, - "intent_template_id": 360, - "start_urls": ["__GITLAB__"], - "intent": "Reply on the merge request assigned to me for 404 link: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "intent_template": "Reply on the merge request assigned to me for {{topic}}: if the last comment is from the author, reply \"Thank you\"; otherwise tag the author (e.g., @user) as a reminder.", - "instantiation_dict": {"topic": "404 link"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/notes?target_id=72560&target_type=merge_request", - "http_method": "POST", - "post_data": {"$.note.noteable_type": "MergeRequest", "$.note.note": "^.*@Roshanjossey.*$"} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 418, - "intent_template_id": 361, - "start_urls": ["__GITLAB__"], - "intent": "Set my gitlab status as Busy.", - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": {"status": "Busy"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - 
"expected": { - "url": "__GITLAB__/api/v4/user/status", - "http_method": "PUT", - "post_data": {"availability": "busy", "clear_status_after": null} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 419, - "intent_template_id": 361, - "start_urls": ["__GITLAB__"], - "intent": "Set my gitlab status as Enjoying life.", - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": {"status": "Enjoying life"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/user/status", - "http_method": "PUT", - "post_data": { - "message": "Enjoying life.", - "availability": "busy", - "clear_status_after": null - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 420, - "intent_template_id": 361, - "start_urls": ["__GITLAB__"], - "intent": "Set my gitlab status as Playing Badminton.", - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": {"status": "Playing Badminton"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/user/status", - "http_method": "PUT", - "post_data": { - "message": "Playing Badminton.", - "availability": "busy", - "clear_status_after": null - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 421, - "intent_template_id": 361, - "start_urls": ["__GITLAB__"], - "intent": "Set my gitlab status as Resting due to leg injury.", - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": {"status": "Resting due to leg injury"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - 
"results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/user/status", - "http_method": "PUT", - "post_data": { - "message": "Resting due to leg injury.", - "availability": "busy", - "clear_status_after": null - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 422, - "intent_template_id": 361, - "start_urls": ["__GITLAB__"], - "intent": "Set my gitlab status as Out of Office.", - "intent_template": "Set my gitlab status as {{status}}.", - "instantiation_dict": {"status": "Out of Office"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/user/status", - "http_method": "PUT", - "post_data": { - "message": "Out of Office.", - "availability": "busy", - "clear_status_after": null - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 423, - "intent_template_id": 237, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Set all Hollister shirts to on-sale status", - "intent_template": "Set all {{brand}} shirts to on-sale status", - "instantiation_dict": {"brand": "Hollister"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "last_event_only": false, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/126/type/configurable/store/0/set/\\d+/back/edit$", - "post_data": {"report_type": "created_at_order", "from": "02/1/2023", "to": "02/28/2023"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 
424, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the city where A Beautiful Day in the Neighborhood was filmed (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": {"description": "the city where A Beautiful Day in the Neighborhood was filmed"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/188553"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 425, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the longest bridge in the Western hemisphere (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": {"description": "the longest bridge in the Western hemisphere"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/5475586"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 426, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the city in Pennsylvania where a plane crashed during the September 11th 
attacks (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": { - "description": "the city in Pennsylvania where a plane crashed during the September 11th attacks" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/189076"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 427, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the university with 26 Turing Award winners as of December 2021 (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": { - "description": "the university with 26 Turing Award winners as of December 2021" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/65066"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 428, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the undergrad college of the person who developed the Nash equilibrium (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page 
for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": { - "description": "the undergrad college of the person who developed the Nash equilibrium" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/2279034"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 429, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the college or colleges where The Chair was filmed (open each in a separate tab) (use the provided wiki site to look up any needed information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": { - "description": "the college or colleges where The Chair was filmed (open each in a separate tab)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/583390395"} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/relation/172206707"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 430, - "intent_template_id": 371, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "On the map site, view the info page for the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh (use the provided wiki site to look up any needed 
information).", - "intent_template": "On the map site, view the info page for {{description}} (use the provided wiki site to look up any needed information).", - "instantiation_dict": { - "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__MAP__/TASK_430_MAP_RELATION_ID"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 431, - "intent_template_id": 145, - "start_urls": [ - "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html", - "__SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html", - "__SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html" - ], - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/32202$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "32202"} - } - } 
- ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 432, - "intent_template_id": 145, - "start_urls": [ - "__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html", - "__SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html" - ], - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/104498$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "104498"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 433, - "intent_template_id": 145, - "start_urls": [ - "__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html", - "__SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html" - ], - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/52674$", - "http_method": "POST", - "post_data": {"qty": "1", 
"item": "52674"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 434, - "intent_template_id": 145, - "start_urls": [ - "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html", - "__SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html" - ], - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/11615$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "11615"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 435, - "intent_template_id": 145, - "start_urls": [ - "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html", - "__SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html" - ], - "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/77300$", - 
"http_method": "POST", - "post_data": {"qty": "1", "item": "77300"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 436, - "intent_template_id": 156, - "start_urls": ["__SHOPPING__"], - "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?", - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": {"product": "a mattress foundation", "time": "around Feb or March 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/14854$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "14854"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 437, - "intent_template_id": 156, - "start_urls": ["__SHOPPING__"], - "intent": "I previously ordered a table lamp in May 2023 and later cancelled. Can you reorder it for me?", - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": {"product": "a table lamp", "time": "in May 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/15033$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "15033"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 438, - "intent_template_id": 156, - "start_urls": ["__SHOPPING__"], - "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. 
Can you reorder it for me?", - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": {"product": "a TV stand", "time": "sometime around sep 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/35837$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "35837"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 439, - "intent_template_id": 156, - "start_urls": ["__SHOPPING__"], - "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?", - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?", - "instantiation_dict": {"product": "a cat t-shirt", "time": "during 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/24493$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "24493"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 440, - "intent_template_id": 156, - "start_urls": ["__SHOPPING__"], - "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?", - "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. 
Can you reorder it for me?", - "instantiation_dict": {"product": "a make up removal kit", "time": "during summer 2022"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/51958$", - "http_method": "POST", - "post_data": {"qty": "1", "item": "51958"} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 441, - "intent_template_id": 308, - "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update and commit (to the default branch) the website code for the current project using the simple online file editor to change the browser tab title to \"GIVE ME SPACE\"", - "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", - "instantiation_dict": {"title": "GIVE ME SPACE", "target_branch": "the default branch"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", - "http_method": "POST", - "post_data": { - "_method": "put", - "file_path": "index.html", - "branch_name": "main", - "original_branch": "main", - "content": "^<\\!doctype html>.*.*GIVE ME SPACE.*.*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 442, - "intent_template_id": 308, - "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update and commit (to main) the website code for the current project using the simple online file editor to change the browser tab 
title to \"Welcome to my site\"", - "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", - "instantiation_dict": {"title": "Welcome to my site", "target_branch": "main"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", - "http_method": "POST", - "post_data": { - "_method": "put", - "file_path": "index.html", - "branch_name": "main", - "original_branch": "main", - "content": "^<\\!doctype html>.*.*Welcome to my site.*.*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 443, - "intent_template_id": 308, - "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update and commit (to a new branch with no merged request) the website code for the current project using the simple online file editor to change the browser tab title to \"Not an interesting site\"", - "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", - "instantiation_dict": { - "title": "Not an interesting site", - "target_branch": "a new branch with no merged request" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", - "http_method": "POST", - "post_data": { - "_method": "put", - "file_path": "index.html", - "branch_name": 
"^(?!main$).+$", - "original_branch": "main", - "content": "^<\\!doctype html>.*.*Not an interesting site.*.*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 444, - "intent_template_id": 308, - "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update and commit (to a new branch called title-update with no merged request) the website code for the current project using the simple online file editor to change the browser tab title to \"Title Wanted\"", - "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", - "instantiation_dict": { - "title": "Title Wanted", - "target_branch": "a new branch called title-update with no merged request" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", - "http_method": "POST", - "post_data": { - "_method": "put", - "file_path": "index.html", - "branch_name": "title-update", - "original_branch": "main", - "content": "^<\\!doctype html>.*.*Title Wanted.*.*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 445, - "intent_template_id": 308, - "start_urls": ["__GITLAB__/byteblaze/gimmiethat.space"], - "intent": "Update and commit (to main) the website code for the current project using the simple online file editor to change the browser tab title to \"Hello\"", - "intent_template": "Update and commit (to {{target_branch}}) the website code for the current project using the simple online file editor to change the browser tab title to \"{{title}}\"", - "instantiation_dict": {"title": "Hello", "target_branch": 
"main"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/index.html", - "http_method": "POST", - "post_data": { - "_method": "put", - "file_path": "index.html", - "branch_name": "main", - "original_branch": "main", - "content": "^<\\!doctype html>.*.*Hello.*.*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 446, - "intent_template_id": 999, - "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding 404 errors in a11yproject to Roshanjossey.", - "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", - "instantiation_dict": { - "issue": "404 errors", - "issue_location": "in a11yproject ", - "account": "Roshanjossey" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } - }, - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1478.json", - "http_method": "PUT", - "post_data": { "$.issue.assignee_ids": [2264] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 447, - "intent_template_id": 999, - "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.", - "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", - "instantiation_dict": { - "issue": "linking to an accessibility statement", - 
"issue_location": "in a11y-webring.club ", - "account": "Rohan" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } - }, - "expected": { - "url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71.json", - "http_method": "PUT", - "post_data": { "$.issue.assignee_ids": [2366] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 448, - "intent_template_id": 331, - "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com", - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "https://egg.tart.com"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/-/profile", - "http_method": "POST", - "post_data": {"user[website_url]": "https://egg.tart.com"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 449, - "intent_template_id": 331, - "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to helloworld.xyz", - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "helloworld.xyz"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { "type": "object", 
"properties": { "user[website_url]": {"type": "string"} } }, - "expected": { - "url": "__GITLAB__/-/profile", - "http_method": "POST", - "post_data": { "user[website_url]": ["https://helloworld.xyz", "http://helloworld.xyz"] }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 450, - "intent_template_id": 331, - "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to https://a11yproject.contributor.me", - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "https://a11yproject.contributor.me"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/-/profile", - "http_method": "POST", - "post_data": {"user[website_url]": "https://a11yproject.contributor.me"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 451, - "intent_template_id": 331, - "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com", - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "www.byteblaze.com"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { "type": "object", "properties": { "user[website_url]": {"type": "string"} } }, - "expected": { - "url": "__GITLAB__/-/profile", - "http_method": "POST", - "post_data": { - "user[website_url]": ["https://www.byteblaze.com", "http://www.byteblaze.com"] - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 452, - 
"intent_template_id": 331, - "start_urls": ["__GITLAB__"], - "intent": "set the homepage URL on my GitLab profile to https://byteblaze.github.io", - "intent_template": "set the homepage URL on my GitLab profile to {{url}}", - "instantiation_dict": {"url": "https://byteblaze.github.io"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/-/profile", - "http_method": "POST", - "post_data": {"user[website_url]": "https://byteblaze.github.io"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 453, - "intent_template_id": 242, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.", - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": {"product": "Teton pullover hoodie"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/78/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[status]": "2"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 454, - "intent_template_id": 242, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.", - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": {"product": "Ryker Tee Crew Neck"}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/478/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[status]": "2"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 455, - "intent_template_id": 242, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.", - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": {"product": "Helios Endurance Tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/676/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[status]": "2"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 456, - "intent_template_id": 242, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Disable Cora Pant from the site, they are facing some quality issues.", - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": {"product": "Cora Pant"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1840/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[status]": "2"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 457, - "intent_template_id": 242, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.", - "intent_template": "Disable {{product}} from the site, they are facing some quality issues.", - "instantiation_dict": {"product": "Karmen yoga pants"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1819/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[status]": "2"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 458, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], - "intent": "Reduce the price of the product on the current page by $5", - "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "$5", "action": "Reduce"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1481/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "27.00"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - 
"task_id": 459, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/237/"], - "intent": "Reduce the price of the product on the current page by 10%", - "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "10%", "action": "Reduce"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/237/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "62.10"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 460, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/418/"], - "intent": "Reduce the price of the product on the current page by 15%", - "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "15%", "action": "Reduce"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/418/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "38.25"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 461, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/721/"], - "intent": "Increase the price of the product on the current page by $11.5", - "intent_template": "{{action}} the price of the product on the 
current page by {{amount}}", - "instantiation_dict": {"amount": "$11.5", "action": "Increase"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/721/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "29.50"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 462, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/710/"], - "intent": "Increase the price of the product on the current page by 10%", - "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "10%", "action": "Increase"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/710/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "19.80"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 463, - "intent_template_id": 247, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/996/"], - "intent": "Increase the price of the product on the current page by 15%", - "intent_template": "{{action}} the price of the product on the current page by {{amount}}", - "instantiation_dict": {"amount": "15%", "action": "Increase"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/996/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[price]": "36.80"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 464, - "intent_template_id": 251, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Antonia Racer Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "instantiation_dict": {"product": "Antonia Racer Tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1796/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": {"product[short_description]": "

1 customer(s) love it!

"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 465, - "intent_template_id": 186, - "start_urls": ["__SHOPPING__"], - "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list", - "intent_template": "Add {{product}} to my wish list", - "instantiation_dict": { - "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "71337", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 466, - "intent_template_id": 186, - "start_urls": ["__SHOPPING__"], - "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list", - "intent_template": "Add {{product}} to my wish list", - "instantiation_dict": {"product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "22787", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 467, - "intent_template_id": 186, - "start_urls": ["__SHOPPING__"], - "intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list", - "intent_template": "Add {{product}} to my wish list", - "instantiation_dict": { - 
"product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "85498", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 468, - "intent_template_id": 186, - "start_urls": ["__SHOPPING__"], - "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list", - "intent_template": "Add {{product}} to my wish list", - "instantiation_dict": { - "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "30843", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 469, - "intent_template_id": 186, - "start_urls": ["__SHOPPING__"], - "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list", - "intent_template": "Add {{product}} to my wish list", - "instantiation_dict": { - "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals 
Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "60047", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 470, - "intent_template_id": 257, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Cancel order 302", - "intent_template": "Cancel order {{id}}", - "instantiation_dict": {"id": "302"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/302/", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 471, - "intent_template_id": 257, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Cancel order 307", - "intent_template": "Cancel order {{id}}", - "instantiation_dict": {"id": "307"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/307/", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 472, - "intent_template_id": 257, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Cancel order 299", - "intent_template": "Cancel order {{id}}", - "instantiation_dict": {"id": "299"}, - "eval": [ - { - 
"evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/299/", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 473, - "intent_template_id": 257, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Cancel order 301", - "intent_template": "Cancel order {{id}}", - "instantiation_dict": {"id": "301"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/301/", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 474, - "intent_template_id": 257, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Cancel order 305", - "intent_template": "Cancel order {{id}}", - "instantiation_dict": {"id": "305"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/cancel/order_id/305/", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 475, - "intent_template_id": 292, - "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name chatgpt_plugin", - "intent_template": "Set up a new, empty repository with the name {{project_name}}", - "instantiation_dict": {"project_name": "chatgpt_plugin"}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "chatgpt_plugin", - "project[namespace_id]": "2505", - "project[path]": "chatgpt_plugin", - "project[initialize_with_readme]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 476, - "intent_template_id": 292, - "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_llm_reading", - "intent_template": "Set up a new, empty repository with the name {{project_name}}", - "instantiation_dict": {"project_name": "awesome_llm_reading"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "awesome_llm_reading", - "project[namespace_id]": "2505", - "project[path]": "awesome_llm_reading", - "project[initialize_with_readme]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 477, - "intent_template_id": 292, - "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning", - "intent_template": "Set up a new, empty repository with the name {{project_name}}", - "instantiation_dict": {"project_name": "awesome_program_aided_reasoning"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": 
"__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "awesome_program_aided_reasoning", - "project[namespace_id]": "2505", - "project[path]": "awesome_program_aided_reasoning", - "project[initialize_with_readme]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 478, - "intent_template_id": 292, - "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name webagent", - "intent_template": "Set up a new, empty repository with the name {{project_name}}", - "instantiation_dict": {"project_name": "webagent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "webagent", - "project[namespace_id]": "2505", - "project[path]": "webagent", - "project[initialize_with_readme]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 479, - "intent_template_id": 292, - "start_urls": ["__GITLAB__"], - "intent": "Set up a new, empty repository with the name awesome_webagent", - "intent_template": "Set up a new, empty repository with the name {{project_name}}", - "instantiation_dict": {"project_name": "awesome_webagent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "awesome_webagent", - "project[namespace_id]": "2505", - "project[path]": "awesome_webagent", - "project[initialize_with_readme]": "0" - }, - "response_status": 302 - } - } - ], - 
"revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 480, - "intent_template_id": 293, - "start_urls": ["__GITLAB__"], - "intent": "Invite yjlou as collaborator(s) to solarized-prism-theme repo", - "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", - "instantiation_dict": {"collaborator_account_list": "yjlou", "repo": "solarized-prism-theme"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/188/members", - "http_method": "POST", - "post_data": {"user_id": 168, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 481, - "intent_template_id": 294, - "start_urls": ["__GITLAB__"], - "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": {"name": "Abishek"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/members", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 482, - "intent_template_id": 294, - "start_urls": ["__GITLAB__"], - "intent": "yjlou wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", - "instantiation_dict": {"name": "yjlou"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/members", - "http_method": "POST", - "post_data": {"user_id": 168, "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 483, - "intent_template_id": 294, - "start_urls": ["__GITLAB__"], - "intent": "Koushik wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": {"name": "Koushik"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/members", - "http_method": "POST", - "post_data": {"user_id": 1912, "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 484, - "intent_template_id": 294, - "start_urls": ["__GITLAB__"], - "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "intent_template": "{{name}} wants to check my dotfile configurations. 
Please invite him to the repo as a guest.", - "instantiation_dict": {"name": "Jakub Klinkovsk\u00fd"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/members", - "http_method": "POST", - "post_data": {"user_id": 1842, "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 485, - "intent_template_id": 294, - "start_urls": ["__GITLAB__"], - "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.", - "instantiation_dict": {"name": "Vinta"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/members", - "http_method": "POST", - "post_data": {"user_id": 278, "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 486, - "intent_template_id": 275, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".", - "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": { - "heading": "Bruh bro you clicked the wrong page", - "old_heading": "404 Not Found" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", - "http_method": "POST", - "post_data": { - "title": "Bruh bro you clicked the wrong page", - "is_active": "1", - "store_id[0]": "0", - "page_id": "1" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 487, - "intent_template_id": 275, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".", - "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": { - "heading": "Cookie monster coming to your place", - "old_heading": "Enable Cookies" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", - "http_method": "POST", - "post_data": { - "title": "Cookie monster coming to your place", - "is_active": "1", - "store_id[0]": "0", - "page_id": "3" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 488, - "intent_template_id": 275, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".", - "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": {"heading": "This is the home page!! 
Leave here!!", "old_heading": "Home Page"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", - "http_method": "POST", - "post_data": { - "title": "This is the home page!! Leave here!!", - "is_active": "1", - "store_id[0]": "0", - "page_id": "2" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 489, - "intent_template_id": 275, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".", - "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": { - "heading": "No privacy policy is needed in this dystopian world", - "old_heading": "Privacy Policy" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", - "http_method": "POST", - "post_data": { - "title": "No privacy policy is needed in this dystopian world", - "is_active": "1", - "store_id[0]": "0", - "page_id": "4" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 490, - "intent_template_id": 275, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Change the page title of \"About us\" page on my site to \"Secret\".", - "intent_template": "Change the page title of \"{{old_heading}}\" page on my site to \"{{heading}}\".", - "instantiation_dict": {"heading": "Secret", "old_heading": "About us"}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/cms/page/save/back/edit", - "http_method": "POST", - "post_data": {"title": "Secret", "is_active": "1", "store_id[0]": "0", "page_id": "5"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 491, - "intent_template_id": 280, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"", - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": {"name": "Sarah Miller", "message": "the order is ready to be shipped soon!"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 492, - "intent_template_id": 280, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"", - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": {"name": "Jane Doe", "message": "sorry we are out of stock, please reorder"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/302", - "http_method": "POST", - "post_data": { - 
"history[comment]": "sorry we are out of stock, please reorder", - "history[is_customer_notified]": "1" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 493, - "intent_template_id": 280, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Notify Grace Nguyen in their most recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"", - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": { - "name": "Grace Nguyen", - "message": "sorry we are bankrupt, please contact our customer service for refund" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/65", - "http_method": "POST", - "post_data": { - "history[comment]": "sorry we are bankrupt, please contact our customer service for refund", - "history[is_customer_notified]": "1" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 494, - "intent_template_id": 280, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"", - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": {"name": "Alex Thomas", "message": "Yo, your order will be shipped soon!"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": 
"__SHOPPING_ADMIN__/sales/order/addComment/order_id/304", - "http_method": "POST", - "post_data": { - "history[comment]": "Yo, your order will be shipped soon!", - "history[is_customer_notified]": "1" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 495, - "intent_template_id": 280, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"", - "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"", - "instantiation_dict": {"name": "Lily Potter", "message": "Thanks, your order is ready to be shipped!"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addComment/order_id/303", - "http_method": "POST", - "post_data": { - "history[comment]": "Thanks, your order is ready to be shipped!", - "history[is_customer_notified]": "1" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 496, - "intent_template_id": 284, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update order #299 with the Federal Express tracking number 8974568499", - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": {"tracking": "8974568499", "order": "299", "service": "Federal Express"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/299/", - "http_method": "POST", - 
"post_data": {"tracking[1][carrier_code]": "fedex", "tracking[1][number]": "8974568499"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 497, - "intent_template_id": 284, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update order #307 with the DHL tracking number 24353446464", - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": {"tracking": "24353446464", "order": "307", "service": "DHL"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/307/", - "http_method": "POST", - "post_data": {"tracking[1][carrier_code]": "dhl", "tracking[1][number]": "24353446464"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 498, - "intent_template_id": 284, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update order #306 with the UPS tracking number 55591023930", - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": {"tracking": "55591023930", "order": "306", "service": "UPS"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/306/", - "http_method": "POST", - "post_data": {"tracking[1][carrier_code]": "ups", "tracking[1][number]": "55591023930"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 499, - "intent_template_id": 284, - "start_urls": 
["__SHOPPING_ADMIN__"], - "intent": "Update order #304 with the USPS tracking number 13849373987", - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": {"tracking": "13849373987", "order": "304", "service": "USPS"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/304/", - "http_method": "POST", - "post_data": {"tracking[1][carrier_code]": "usps", "tracking[1][number]": "13849373987"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 500, - "intent_template_id": 284, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update order #301 with the DHL tracking number 239028439840", - "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}", - "instantiation_dict": {"tracking": "239028439840", "order": "301", "service": "DHL"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/admin/order_shipment/save/order_id/301/", - "http_method": "POST", - "post_data": {"tracking[1][carrier_code]": "dhl", "tracking[1][number]": "239028439840"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 501, - "intent_template_id": 287, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all Taurus Elements Shell as out of stock", - "intent_template": "Mark all {{product}} as out of stock", - "instantiation_dict": {"product": "Taurus Elements Shell"}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/350/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 502, - "intent_template_id": 287, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all Gobi HeatTec Tee as out of stock", - "intent_template": "Mark all {{product}} as out of stock", - "instantiation_dict": {"product": "Gobi HeatTec Tee"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/446/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 503, - "intent_template_id": 287, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all rocco gym tank as out of stock", - "intent_template": "Mark all {{product}} as out of stock", - "instantiation_dict": {"product": "rocco gym tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/682/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": 
{"product[quantity_and_stock_status][is_in_stock]": "0"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 504, - "intent_template_id": 287, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all Selene yoga hoodie as out of stock", - "intent_template": "Mark all {{product}} as out of stock", - "instantiation_dict": {"product": "Selene yoga hoodie"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 505, - "intent_template_id": 287, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Mark all Aeon capri as out of stock", - "intent_template": "Mark all {{product}} as out of stock", - "instantiation_dict": {"product": "Aeon capri"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1861/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][is_in_stock]": "0"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 506, - "intent_template_id": 172, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200. 
Discard any items in your cart if it is not empty.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", - "instantiation_dict": {"product_category": "meat substitute", "dollar_value": "between 100 and 200"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "decode_base64_query": true, - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/22490$", - "http_method": "POST", - "post_data": {"product": "22490"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["\\_"], - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", - "response_content": { - "items_qty": 1, - "$.items[0].name": "Beyond Meat Beef Beefy Crumble, 5 Pound -- 2 per case." - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", - "http_method": "POST", - "post_data": {"$.billingAddress.customerId": "27"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/checkout/onepage/success"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 507, - "intent_template_id": 172, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000. Discard any items in your cart if it is not empty.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", - "instantiation_dict": {"product_category": "Ceiling light", "dollar_value": "above 1000"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "decode_base64_query": true, - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/71506$", - "http_method": "POST", - "post_data": {"product": "71506"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["\\_"], - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", - "response_content": { - "items_qty": 1, - "$.items[0].name": "40''X138\" Hight Ceiling Chandelier Crystal Raindrop Chandeliers Staircase Large Chandelier Villa Entrance Foyer Pendant Light Grand Light Foyer High Ceiling Fixture 15 GU10 Bulb Include Remote Dimming" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", - "http_method": "POST", - "post_data": {"$.billingAddress.customerId": "27"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/checkout/onepage/success"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 508, - "intent_template_id": 172, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the NS switch category within a budget under 60. Discard any items in your cart if it is not empty.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. 
Discard any items in your cart if it is not empty.", - "instantiation_dict": {"product_category": "NS switch", "dollar_value": "under 60"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "decode_base64_query": true, - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/40448$", - "http_method": "POST", - "post_data": {"product": "40448"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["\\_"], - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", - "response_content": { - "items_qty": 1, - "$.items[0].name": "Multi Pouch The Legend of Zelda Breath of the Wild for Nintendo Switch" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", - "http_method": "POST", - "post_data": {"$.billingAddress.customerId": "27"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/checkout/onepage/success"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 509, - "intent_template_id": 216, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the best rating product from \"Men\"s shoe\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. 
Choose any available variant.", - "instantiation_dict": {"category": "Men\"s shoe"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "decode_base64_query": true, - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/65885$", - "http_method": "POST", - "post_data": {"product": "65885"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["\\_"], - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", - "response_content": {"items_qty": 1, "$.items[0].name": "Clarks Men's Tunsil Lane Oxford"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", - "http_method": "POST", - "post_data": {"$.billingAddress.customerId": "27"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/checkout/onepage/success"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 510, - "intent_template_id": 216, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive. Choose any available variant.", - "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive. 
Choose any available variant.", - "instantiation_dict": {"category": "Home Audio Speaker"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "decode_base64_query": true, - "expected": { - "url": "^__SHOPPING__/checkout/cart/add/uenc/.*/product/75640$", - "http_method": "POST", - "post_data": {"product": "75640"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["_"], - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/totals", - "response_content": { - "items_qty": 1, - "$.items[0].name": "Atlantic Technology FS-7.0-GLB 7-channel Surround Bar (Gloss Black) (Discontinued by Manufacturer)" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/rest/default/V1/carts/mine/payment-information", - "http_method": "POST", - "post_data": {"$.billingAddress.customerId": "27"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/checkout/onepage/success"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 511, - "intent_template_id": 189, - "start_urls": ["__SHOPPING__"], - "intent": "Add a laundry detergent to my wish list.", - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": {"product": "laundry detergent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "response_status": 302, - "response_cookies": { - "mage-messages": "^.*(?=.*laundry)(?=.*detergent).* has been added to your wish list.*$" - }, - "post_data": {"qty": null} - } - } - ], - "revision": 2 - 
}, - { - "sites": ["shopping"], - "task_id": 512, - "intent_template_id": 189, - "start_urls": ["__SHOPPING__"], - "intent": "Add a toothpaste to my wish list.", - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": {"product": "toothpaste"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "response_status": 302, - "response_cookies": {"mage-messages": "^.*toothpaste.* has been added to your wish list.*$"}, - "post_data": {"qty": null} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 513, - "intent_template_id": 189, - "start_urls": ["__SHOPPING__"], - "intent": "Add a chair to my wish list.", - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": {"product": "chair"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "response_status": 302, - "response_cookies": {"mage-messages": "^.*chair.* has been added to your wish list.*$"}, - "post_data": {"qty": null} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 514, - "intent_template_id": 189, - "start_urls": ["__SHOPPING__"], - "intent": "Add a white desk to my wish list.", - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": {"product": "white desk"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "response_status": 302, - "response_cookies": { - "mage-messages": "^.*(?=.*white)(?=.*desk).* has been added to your wish list.*$" - }, - "post_data": {"qty": null} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 515, - "intent_template_id": 189, - "start_urls": ["__SHOPPING__"], - "intent": "Add a white computer desk to my wish list.", - "intent_template": "Add a {{product}} to my wish list.", - "instantiation_dict": {"product": "white computer desk"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "response_status": 302, - "response_cookies": { - "mage-messages": "^.*(?=.*white)(?=.*computer)(?=.*desk).* has been added to your wish list.*$" - }, - "post_data": {"qty": null} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 516, - "intent_template_id": 196, - "start_urls": [ - "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html" - ], - "intent": "Add the product on the current page to my wishlist", - "intent_template": "Add the product on the current page to my wishlist", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "104497", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 517, - "intent_template_id": 196, 
- "start_urls": [ - "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html" - ], - "intent": "Add the product on the current page to my wishlist", - "intent_template": "Add the product on the current page to my wishlist", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "37811", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 518, - "intent_template_id": 196, - "start_urls": ["__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html"], - "intent": "Add the product on the current page to my wishlist", - "intent_template": "Add the product on the current page to my wishlist", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "37285", "qty": null}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 519, - "intent_template_id": 196, - "start_urls": [ - "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html" - ], - "intent": "Add the product on the current page to my wishlist", - "intent_template": "Add the product on the current page to my wishlist", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "37339", "qty": "1"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 520, - "intent_template_id": 196, - "start_urls": [ - "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html" - ], - "intent": "Add the product on the current page to my wishlist", - "intent_template": "Add the product on the current page to my wishlist", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/wishlist/index/add/", - "http_method": "POST", - "post_data": {"product": "37313", "qty": "1"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 521, - "intent_template_id": 199, - "start_urls": ["__SHOPPING__"], - "intent": "Subscribe to the newsletter of OneStopMarket", - "intent_template": "Subscribe to the newsletter of OneStopMarket", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/newsletter/subscriber/new/", - "http_method": "POST", - "post_data": {"email": "emma.lopez@gmail.com"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 522, - 
"intent_template_id": 352, - "start_urls": ["__GITLAB__"], - "intent": "Fork all repos from facebook.", - "intent_template": "Fork {{repo}}.", - "instantiation_dict": {"repo": "all repos from facebook"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/122/fork", - "http_method": "POST", - "post_data": { - "id": "122", - "name": "create-react-app", - "namespace_id": 2505, - "path": "create-react-app" - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 523, - "intent_template_id": 354, - "start_urls": ["__GITLAB__"], - "intent": "Star the top five most stared repos in Gitlab", - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": {"number": "five"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 56} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 53} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 52} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 48} - } - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 47} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 524, - "intent_template_id": 354, - "start_urls": ["__GITLAB__"], - "intent": "Star the top eight most stared repos in Gitlab", - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": {"number": "eight"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 56} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 53} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 52} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 48} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/koush/AndroidAsync/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 47} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/eriklindernoren/PyTorch-GAN/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 46} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/thoughtbot/administrate/toggle_star.json", - 
"http_method": "POST", - "response_content": {"star_count": 45} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/keycloak/keycloak/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 44} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 525, - "intent_template_id": 354, - "start_urls": ["__GITLAB__"], - "intent": "Star the top four most stared repos in Gitlab", - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": {"number": "four"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 56} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 53} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 52} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/mk-j/PHP_XLSXWriter/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 48} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 526, - "intent_template_id": 354, - "start_urls": ["__GITLAB__"], - "intent": "Star the top three most stared repos in Gitlab", - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": {"number": "three"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": 
"mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 56} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/facebook/create-react-app/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 53} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/kkroening/ffmpeg-python/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 52} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 527, - "intent_template_id": 354, - "start_urls": ["__GITLAB__"], - "intent": "Star the top one most stared repos in Gitlab", - "intent_template": "Star the top {{number}} most stared repos in Gitlab", - "instantiation_dict": {"number": "one"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/umano/AndroidSlidingUpPanel/toggle_star.json", - "http_method": "POST", - "response_content": {"star_count": 56} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 528, - "intent_template_id": 154, - "start_urls": ["__SHOPPING__"], - "intent": "Fill out the contact us form with this refund message for the phone screen protector bought March 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "instantiation_dict": {"product": "phone screen protector", "time": "March 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*180[.\\s]+Amount\\s*:\\s*(\\$\\s*)?12\\.99(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 529, - "intent_template_id": 154, - "start_urls": ["__SHOPPING__"], - "intent": "Fill out the contact us form with this refund message for the Stereo Sound System bought Feb 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "instantiation_dict": {"product": "Stereo Sound System", "time": "Feb 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*157[.\\s]+Amount\\s*:\\s*(\\$\\s*)?122\\.05(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 530, - "intent_template_id": 154, - "start_urls": ["__SHOPPING__"], - "intent": "Fill out the contact us form with this refund message for the kitchen organizer bought around Feb 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "instantiation_dict": {"product": "kitchen organizer", "time": "around Feb 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "^I\\s+need\\s+a\\s+refund[.\\s]+Order\\s*:\\s*#?\\s*0*161[.\\s]+Amount\\s*:\\s*(\\$\\s*)?68\\.88(\\s*\\$|\\s+dollars)?[.\\s]+Reason\\s*:\\s*It\\s+broke\\s+after\\s+three\\s+days\\s+of\\s+use\\.?$" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 531, - "intent_template_id": 154, - "start_urls": ["__SHOPPING__"], - "intent": "Fill out the contact us form with this refund message for the phone case bought March 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "instantiation_dict": {"product": "phone case", "time": "March 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 532, - "intent_template_id": 154, - "start_urls": ["__SHOPPING__"], - "intent": "Fill out the contact us form with this refund message for the PS3 remote controller bought early 2023: \"I need a refund. Order: . Amount: . Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "intent_template": "Fill out the contact us form with this refund message for the {{product}} bought {{time}}: \"I need a refund. Order: . Amount: . 
Reason: It broke after three days of use.\" Leave the filled form ready for review and do not submit it.", - "instantiation_dict": {"product": "PS3 remote controller", "time": "early 2023"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 533, - "intent_template_id": 330, - "start_urls": ["__GITLAB__"], - "intent": "Follow [\"convexegg\", \"yjlou\"] on Gitlab", - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": {"account_list": "[\"convexegg\", \"yjlou\"]"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/convexegg/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/yjlou/follow.json", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 534, - "intent_template_id": 330, - "start_urls": ["__GITLAB__"], - "intent": "Follow [\"Jakub Klinkovsk\u00fd\", \"Koushik\", \"Vinta Chen\"] on Gitlab", - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": {"account_list": "[\"Jakub Klinkovsk\u00fd\", \"Koushik\", \"Vinta Chen\"]"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/lahwaacz/follow.json", - "http_method": 
"POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/koush/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/vinta/follow.json", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 535, - "intent_template_id": 330, - "start_urls": ["__GITLAB__"], - "intent": "Follow [\"Jakub K\", \"ghost\", \"Beno\u00eet Blanchon\"] on Gitlab", - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": {"account_list": "[\"Jakub K\", \"ghost\", \"Beno\u00eet Blanchon\"]"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/lahwaacz/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/ghost/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/bblanchon/follow.json", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 536, - "intent_template_id": 330, - "start_urls": ["__GITLAB__"], - "intent": "Follow [\"ghost\", \"R1kk3r\", \"Abishek\"] on Gitlab", - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": {"account_list": "[\"ghost\", \"R1kk3r\", \"Abishek\"]"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - 
"expected": { - "url": "__GITLAB__/users/ghost/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/R1kk3r/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/abisubramanya27/follow.json", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 537, - "intent_template_id": 330, - "start_urls": ["__GITLAB__"], - "intent": "Follow [\"Jakub Klinkovsk\", \"convexegg\", \"Vinta Chen\", \"yjlou\", \"Abishek S\"] on Gitlab", - "intent_template": "Follow {{account_list}} on Gitlab", - "instantiation_dict": { - "account_list": "[\"Jakub Klinkovsk\", \"convexegg\", \"Vinta Chen\", \"yjlou\", \"Abishek S\"]" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/lahwaacz/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/convexegg/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/vinta/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/yjlou/follow.json", - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/users/abisubramanya27/follow.json", - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 
538, - "intent_template_id": 240, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001", - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": { - "order_id": "299", - "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/598/", - "http_method": "POST", - "post_data": { - "street[0]": "456 Oak Avenue", - "street[1]": "Apartment 5B", - "country_id": "US", - "region": "New York", - "region_id": "43", - "city": "New York", - "postcode": "10001" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 539, - "intent_template_id": 240, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102", - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": {"order_id": "65", "address": "789 Pine Lane, San Francisco, CA, 94102"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/130/", - "http_method": "POST", - "post_data": { - "street[0]": "789 Pine Lane", - "country_id": "US", - "region": "California", - "region_id": "12", - "city": "San Francisco", - "postcode": "94102" - }, - 
"response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 540, - "intent_template_id": 240, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201", - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": { - "order_id": "301", - "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/602/", - "http_method": "POST", - "post_data": { - "street[0]": "321 Birch Boulevard", - "street[1]": "Suite 200", - "country_id": "US", - "region": "Texas", - "region_id": "57", - "city": "Dallas", - "postcode": "75201" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 541, - "intent_template_id": 240, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101", - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": {"order_id": "125", "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/249/", - "http_method": "POST", - "post_data": { - "street[0]": "654 Elm Drive", - "street[1]": "Apartment 
12", - "country_id": "US", - "region": "Florida", - "region_id": "18", - "city": "Miami", - "postcode": "33101" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 542, - "intent_template_id": 240, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012", - "intent_template": "Modify the address of order #{{order_id}} to {{address}}", - "instantiation_dict": {"order_id": "300", "address": "987 Cedar Court, Los Angeles, CA, 90012"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "url": "__SHOPPING_ADMIN__/sales/order/addressSave/address_id/600/", - "http_method": "POST", - "post_data": { - "street[0]": "987 Cedar Court", - "country_id": "US", - "region": "California", - "region_id": "12", - "city": "Los Angeles", - "postcode": "90012" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 543, - "intent_template_id": 251, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Bella Tank to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "instantiation_dict": {"product": "Bella Tank"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", 
"retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1684/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": {"product[short_description]": "

2 customer(s) love it!

"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 544, - "intent_template_id": 251, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Selene Yoga Hoodie to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "instantiation_dict": {"product": "Selene Yoga Hoodie"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1108/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": {"product[short_description]": "

3 customer(s) love it!

"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 545, - "intent_template_id": 251, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Radiant Tee to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "instantiation_dict": {"product": "Radiant Tee"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1556/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": {"product[short_description]": "

1 customer(s) love it!

"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 546, - "intent_template_id": 251, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Update the product description of Lucia Cross-Fit Bra to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "intent_template": "Update the product description of {{product}} to \"{count} customer(s) love it!\" where count is the number of reviews with 4 stars or above, or \"don't miss out on this amazing product\" when there are no such reviews.", - "instantiation_dict": {"product": "Lucia Cross-Fit Bra"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1668/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": {"product[short_description]": "

don't miss out on this amazing product

"} - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 547, - "intent_template_id": 252, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt", - "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", - "instantiation_dict": { - "option_spec": "a new color option brown", - "base_setting": "the size S of", - "product": "Phoebe Zipper Sweatshirt" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/93", - "http_method": "POST", - "response_status": 302, - "post_data": { - "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=Brown.*$" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } - } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1130/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Phoebe Zipper Sweatshirt", - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: s, color: brown"] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 548, - "intent_template_id": 252, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a new color blue to size S and M of Frankie Sweatshirt", - "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", - "instantiation_dict": { - "option_spec": "a new color blue", - "base_setting": "size S and M of", - "product": "Frankie Sweatshirt" - }, - "eval": [ - { 
- "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/93", - "http_method": "POST", - "response_status": 302, - "post_data": { - "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=Blue.*$" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } - } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/110/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Frankie Sweatshirt", - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: s, color: blue", "size: m, color: blue"] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 549, - "intent_template_id": 252, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a new size XXXL to green Minerva LumaTech V-Tee", - "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", - "instantiation_dict": { - "option_spec": "a new size XXXL", - "base_setting": "green", - "product": "Minerva LumaTech V-Tee" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/144", - "http_method": "POST", - "response_status": 302, - "post_data": { - "serialized_options": "^.*swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=XXXL.*$" - } - } - }, - { - 
"evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } - } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1492/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Minerva LumaTech™ V-Tee", - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: xxxl, color: green"] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 550, - "intent_template_id": 252, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a new size XXS to blue and purple Nona Fitness Tank", - "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", - "instantiation_dict": { - "option_spec": "a new size XXS", - "base_setting": "blue and purple", - "product": "Nona Fitness Tank" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/catalog/product_attribute/save/attribute_id/144", - "http_method": "POST", - "response_status": 302, - "post_data": { - "serialized_options": "^.*&swatchtext%5Bvalue%5D%5Boption_\\d+%5D%5B0%5D=XXS.*$" - } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": { "type": "array", "items": {"type": "string"} } - } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1732/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Nona Fitness Tank", - 
"$['configurable-matrix-serialized'][?(@.newProduct == 1)].attributes": ["size: xxs, color: blue", "size: xxs, color: purple"] - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 551, - "intent_template_id": 252, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add new size 30 and 31 to all color variants of Diana Tights", - "intent_template": "Add {{option_spec}} to {{base_setting}} {{product}}", - "instantiation_dict": { - "option_spec": "new size 30 and 31", - "base_setting": "all color variants of", - "product": "Diana Tights" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1854/type/configurable/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[configurable_attributes_data][93][code]": "color", - "product[configurable_attributes_data][93][position]": "1", - "product[configurable_attributes_data][93][values][49][include]": "1", - "product[configurable_attributes_data][93][values][49][value_index]": "49", - "product[configurable_attributes_data][93][values][50][include]": "1", - "product[configurable_attributes_data][93][values][50][value_index]": "50", - "product[configurable_attributes_data][93][values][56][include]": "1", - "product[configurable_attributes_data][93][values][56][value_index]": "56", - "product[configurable_attributes_data][144][attribute_id]": "144", - "product[configurable_attributes_data][144][code]": "size", - "product[configurable_attributes_data][144][position]": "0", - "product[configurable_attributes_data][144][values][171][include]": "1", - "product[configurable_attributes_data][144][values][171][value_index]": "171", - "product[configurable_attributes_data][144][values][172][include]": "1", 
- "product[configurable_attributes_data][144][values][172][value_index]": "172", - "product[configurable_attributes_data][144][values][173][include]": "1", - "product[configurable_attributes_data][144][values][173][value_index]": "173", - "product[configurable_attributes_data][144][values][174][include]": "1", - "product[configurable_attributes_data][144][values][174][value_index]": "174" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 552, - "intent_template_id": 84, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Use the Web IDE to create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the space forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. 
Commit the changes to the default branch.", - "instantiation_dict": {"directory": "real_space", "gitlab_repo": "gimmiethat.space", "forum": "space"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "create", - "$.actions[0].file_path": "real_space/urls.json", - "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus\",\"__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations\",\"__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon\",\"__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black\",\"__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s\"]}", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 553, - "intent_template_id": 84, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Use the Web IDE to create a folder named news in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the news related forums forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. 
Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "instantiation_dict": { - "directory": "news", - "gitlab_repo": "gimmiethat.space", - "forum": "news related forums" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "create", - "$.actions[0].file_path": "news/urls.json", - "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack\", \"__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation\", \"__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law\", \"__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under\", \"__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and\"]}", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 554, - "intent_template_id": 84, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Use the Web IDE to create a folder named movie_space in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the movies forum on the discussion forum as a JSON object with a \"urls\" array. 
Write them in order from newest to oldest. Commit the changes to the default branch.", - "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "instantiation_dict": { - "directory": "movie_space", - "gitlab_repo": "gimmiethat.space", - "forum": "movies" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "create", - "$.actions[0].file_path": "movie_space/urls.json", - "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make\",\"__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in\",\"__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of\",\"__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for\",\"__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia\"]}", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 555, - "intent_template_id": 84, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Use the Web IDE to create a folder named funny_pic in gimmiethat.space repo. 
Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the memes forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "instantiation_dict": {"directory": "funny_pic", "gitlab_repo": "gimmiethat.space", "forum": "memes"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "json"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fgimmiethat.space/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "create", - "$.actions[0].file_path": "funny_pic/urls.json", - "$.actions[0].content": "{\"urls\": [\"__REDDIT__/f/memes/127991/it-do-be-like-that-tho\",\"__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you\",\"__REDDIT__/f/memes/127989/if-you-have-no-other-choice\",\"__REDDIT__/f/memes/127988/yes-yes-yes\",\"__REDDIT__/f/memes/127987/shagadelic-baby\"]}", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "wikipedia"], - "task_id": 556, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_honest_fans with a README file containing 
only Christopher Nolan's theatrically released feature-length films (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "instantiation_dict": { - "name": "nolan_honest_fans", - "topics": "only Christopher Nolan's theatrically released feature-length films" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "nolan_honest_fans", - "project[path]": "nolan_honest_fans", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_honest_fans/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# nolan_honest_fans\n\n- Following\n- Memento\n- Insomnia\n- Batman Begins\n- The Prestige\n- The Dark Knight\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet\n- Oppenheimer", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - 
{ - "sites": ["gitlab", "wikipedia"], - "task_id": 557, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_old_fans with a README file containing only Christopher Nolan's theatrically released feature-length films before 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "instantiation_dict": { - "name": "nolan_old_fans", - "topics": "only Christopher Nolan's theatrically released feature-length films before 2010" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "nolan_old_fans", - "project[path]": "nolan_old_fans", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_old_fans/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# nolan_old_fans\n\n- Following\n- Memento\n- Insomnia\n- Batman 
Begins\n- The Prestige\n- The Dark Knight", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab", "wikipedia"], - "task_id": 558, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_young_fans with a README file containing only Christopher Nolan's theatrically released feature-length films after 2010 (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "instantiation_dict": { - "name": "nolan_young_fans", - "topics": "only Christopher Nolan's theatrically released feature-length films after 2010" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "nolan_young_fans", - "project[path]": "nolan_young_fans", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_young_fans/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": 
"^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# nolan_young_fans\n\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet\n- Oppenheimer", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab", "wikipedia"], - "task_id": 559, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_followers with a README file containing career timeline headings of Christopher Nolan in order (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. 
Commit to the default branch.", - "instantiation_dict": { - "name": "nolan_followers", - "topics": "career timeline headings of Christopher Nolan in order" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "nolan_followers", - "project[path]": "nolan_followers", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_followers/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# nolan_followers\n\n- 1993\u20132003: Early career and breakthrough\n- 2003\u20132013: Widespread recognition\n- 2014\u20132019: Established Hollywood auteur\n- 2020\u2013present", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab", "wikipedia"], - "task_id": 560, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named nolan_academy_awards with a README file containing movies that won Academy Awards by Christopher Nolan (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. 
Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "instantiation_dict": { - "name": "nolan_academy_awards", - "topics": "movies that won Academy Awards by Christopher Nolan" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "nolan_academy_awards", - "project[path]": "nolan_academy_awards", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fnolan_academy_awards/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# nolan_academy_awards\n\n- The Dark Knight\n- Inception\n- Interstellar\n- Dunkirk\n- Tenet", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab", "wikipedia"], - "task_id": 561, - "intent_template_id": 87, - "start_urls": ["__GITLAB__", "__WIKIPEDIA__"], - "intent": "Create a repository named bafta_awards_nolan with a README file containing movies that are nominated BAFTA Awards by Christopher Nolan (use the provided wiki site to look up any needed information). 
Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "intent_template": "Create a repository named {{name}} with a README file containing {{topics}} (use the provided wiki site to look up any needed information). Use the Web IDE to create the README. Structure it with the repository name as the main heading and a bulleted list of the film titles. Commit to the default branch.", - "instantiation_dict": { - "name": "bafta_awards_nolan", - "topics": "movies that are nominated BAFTA Awards by Christopher Nolan" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "bafta_awards_nolan", - "project[path]": "bafta_awards_nolan", - "project[namespace_id]": "2505" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Fbafta_awards_nolan/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# bafta_awards_nolan\n\n- Batman Begins\n- The Dark Knight\n- Inception\n- The Dark Knight Rises\n- Interstellar\n- Dunkirk\n- Tenet", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 3 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 562, - "intent_template_id": 88, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a private repository named Awesome_DIY_ideas with a 
README file containing links to the 6 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "instantiation_dict": {"name": "Awesome_DIY_ideas", "num": 6}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "Awesome_DIY_ideas", - "project[path]": "awesome_diy_ideas", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2FAwesome_DIY_ideas/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# Awesome_DIY_ideas\n\n## Most Active DIY Threads\n\n- [Separate glued plastic 
parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 563, - "intent_template_id": 88, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a private repository named fun_thing_to_do with a README file containing links to the 5 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", - "instantiation_dict": {"name": "fun_thing_to_do", "num": 5}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "fun_thing_to_do", - "project[path]": "fun_thing_to_do", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Ffun_thing_to_do/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# fun_thing_to_do\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. 
Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 564, - "intent_template_id": 88, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a private repository named live_a_life with a README file containing links to the 3 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. 
Commit to the default branch.", - "instantiation_dict": {"name": "live_a_life", "num": 3}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "live_a_life", - "project[path]": "live_a_life", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2Flive_a_life/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# live_a_life\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 565, - "intent_template_id": 88, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a private repository named TODO with a README file containing links to the 10 most active DIY posts from the DIY forum. Use the Web IDE to create the README. 
Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "instantiation_dict": {"name": "TODO", "num": 10}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "TODO", - "project[path]": "todo", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/byteblaze%2FTODO/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# TODO\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement 
Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)\n- [Teflon tape to attach washing machine drain hose to pipe?](__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe)\n- [Paver base for shed?](__REDDIT__/f/DIY/118866/paver-base-for-shed)\n- [Ways to locate our buried electrical service?](__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service)\n- [How to eliminate transitions for disability/mobility reasons](__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons)", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 566, - "intent_template_id": 88, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Create a private repository named Do it myself with a README file containing links to the 8 most active DIY posts from the DIY forum. Use the Web IDE to create the README. Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "intent_template": "Create a private repository named {{name}} with a README file containing links to the {{num}} most active DIY posts from the DIY forum. Use the Web IDE to create the README. 
Structure it with the repository name as the main heading, \"Most Active DIY Threads\" as a subheading, and a bulleted list of the posts as markdown links (using their titles as link text) in the same order they appear on the forum. Commit to the default branch.", - "instantiation_dict": {"name": "Do it myself", "num": 8}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "Do it myself", - "project[path]": "do-it-myself", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "__GITLAB__//api/v4/projects/byteblaze%2Fdo-it-myself-with/repository/commits", - "http_method": "POST", - "post_data": { - "branch": "main", - "$.actions[0].action": "^(update|create)$", - "$.actions[0].file_path": "README.md", - "$.actions[0].content": "# Do it myself\n\n## Most Active DIY Threads\n\n- [Separate glued plastic parts](__REDDIT__/f/DIY/118903/separate-glued-plastic-parts)\n- [How would you fix this dryer vent mess?](__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess)\n- [Basement Bulkhead/Soffit + Wall Framing](__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing)\n- [GE Water Heater Pilot Light Won't Stay Lit](__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit)\n- [Attempting to move a wall outlet in my basement a few inches to the left and am totally stumped. 
Any help GREATLY appreciated!!](__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches)\n- [AFCI Outlet Question](__REDDIT__/f/DIY/118931/afci-outlet-question)\n- [Teflon tape to attach washing machine drain hose to pipe?](__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe)\n- [Paver base for shed?](__REDDIT__/f/DIY/118866/paver-base-for-shed)", - "$.actions[1].action": null - }, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 567, - "intent_template_id": 293, - "start_urls": ["__GITLAB__"], - "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator(s) to gimmiethat.space repo", - "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", - "instantiation_dict": { - "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon", - "repo": "gimmiethat.space" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/184/members", - "http_method": "POST", - "post_data": {"user_id": 1842, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/184/members", - "http_method": "POST", - "post_data": {"user_id": 597, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 568, - "intent_template_id": 293, - "start_urls": ["__GITLAB__"], - "intent": "Invite Abishek and Vinta as collaborator(s) to a11yproject.com repo", - "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", - "instantiation_dict": {"collaborator_account_list": "Abishek and Vinta", "repo": 
"a11yproject.com"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/174/members", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/174/members", - "http_method": "POST", - "post_data": {"user_id": 278, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 569, - "intent_template_id": 293, - "start_urls": ["__GITLAB__"], - "intent": "Invite Beno\u00eet and Abishek as collaborator(s) to my HTML5 markup extention repo", - "intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", - "instantiation_dict": { - "collaborator_account_list": "Beno\u00eet and Abishek", - "repo": "my HTML5 markup extention" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/185/members", - "http_method": "POST", - "post_data": {"user_id": 597, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/185/members", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 570, - "intent_template_id": 293, - "start_urls": ["__GITLAB__"], - "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator(s) to my time tracking tool project repo", - 
"intent_template": "Invite {{collaborator_account_list}} as collaborator(s) to {{repo}} repo", - "instantiation_dict": { - "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon", - "repo": "my time tracking tool project" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/190/members", - "http_method": "POST", - "post_data": {"user_id": 1842, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/190/members", - "http_method": "POST", - "post_data": {"user_id": 2179, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/190/members", - "http_method": "POST", - "post_data": {"user_id": 1693, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/190/members", - "http_method": "POST", - "post_data": {"user_id": 597, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 571, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "231 Willow Way, Suite 100, Chicago, IL, 60601"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": 
null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/customer/address/formPost/id/26/", - "http_method": "POST", - "post_data": { - "firstname": "Emma", - "lastname": "Lopez", - "street[0]": "231 Willow Way", - "street[1]": "Suite 100", - "country_id": "US", - "city": "Chicago", - "postcode": "60601" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 572, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "654 Aspen Road, House #3, Boston, MA, 02110"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/customer/address/formPost/id/26/", - "http_method": "POST", - "post_data": { - "firstname": "Emma", - "lastname": "Lopez", - "street[0]": "654 Aspen Road", - "street[1]": "House #3", - "country_id": "US", - "city": "Boston", - "postcode": "02110" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 573, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "987 Sycamore Circle, Philadelphia, PA, 19102"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": 
"null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/customer/address/formPost/id/26/", - "http_method": "POST", - "post_data": { - "firstname": "Emma", - "lastname": "Lopez", - "street[0]": "987 Sycamore Circle", - "country_id": "US", - "city": "Philadelphia", - "postcode": "19102" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 574, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "111 Magnolia Path, Atlanta, GA, 30303"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/customer/address/formPost/id/26/", - "http_method": "POST", - "post_data": { - "firstname": "Emma", - "lastname": "Lopez", - "street[0]": "111 Magnolia Path", - "country_id": "US", - "city": "Atlanta", - "postcode": "30303" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 575, - "intent_template_id": 165, - "start_urls": ["__SHOPPING__"], - "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly", - "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly", - "instantiation_dict": {"address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101"}, - "eval": [ - { - "evaluator": 
"AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/customer/address/formPost/id/26/", - "http_method": "POST", - "post_data": { - "firstname": "Emma", - "lastname": "Lopez", - "street[0]": "222 Redwood Rise", - "street[1]": "Suite 300", - "country_id": "US", - "city": "Seattle", - "postcode": "98101" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 576, - "intent_template_id": 351, - "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo a11y-webring.club as developer: \"abisubramanya27\" and \"lahwaacz\"", - "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "a11y-webring.club", - "role": "developer", - "user_list": "\"abisubramanya27\" and \"lahwaacz\"", - "repo_prefix": "repo" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/179/invitations", - "http_method": "POST", - "post_data": {"user_id": "5,1842", "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 577, - "intent_template_id": 351, - "start_urls": ["__GITLAB__"], - "intent": "Add the following users to my GitHub timeline item management extension as maintainer: \"abisubramanya27\" and \"lahwaacz\"", - "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "GitHub 
timeline item management extension", - "role": "maintainer", - "user_list": "\"abisubramanya27\" and \"lahwaacz\"", - "repo_prefix": "my" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "__GITLAB__/api/v4/projects/181/invitations", - "http_method": "POST", - "post_data": {"user_id": "5,1842", "access_level": 40}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 578, - "intent_template_id": 351, - "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo millennials-to-snake-people as reporter: \"yjlou\" and \"a11yproject\"", - "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "millennials-to-snake-people", - "role": "reporter", - "user_list": "\"yjlou\" and \"a11yproject\"", - "repo_prefix": "repo" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/187/invitations", - "http_method": "POST", - "post_data": {"user_id": "168,2325", "access_level": 20}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 579, - "intent_template_id": 351, - "start_urls": ["__GITLAB__"], - "intent": "Add the following users to my time tracking tool as guest: \"yjlou\"", - "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "time tracking tool", - 
"role": "guest", - "user_list": "\"yjlou\"", - "repo_prefix": "my" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/190/invitations", - "http_method": "POST", - "post_data": {"user_id": "168", "access_level": 10}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 580, - "intent_template_id": 7, - "start_urls": ["__REDDIT__"], - "intent": "Create a new forum with name and title \"sci_fi\", description \"A wild place for sci-fi enthusiasts\", and sidebar \"**New**, Classic, *Movies*, Post my novel, Random\"", - "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", - "instantiation_dict": { - "name": "sci_fi", - "description": "A wild place for sci-fi enthusiasts", - "sidebar_list": "**New**, Classic, *Movies*, Post my novel, Random" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/create_forum", - "http_method": "POST", - "response_status": 302, - "post_data": { - "forum[name]": "sci_fi", - "forum[title]": "sci_fi", - "forum[description]": "A wild place for sci-fi enthusiasts", - "forum[sidebar]": "**New**, Classic, *Movies*, Post my novel, Random" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 581, - "intent_template_id": 7, - "start_urls": ["__REDDIT__"], - "intent": "Create a new forum with name and title \"cmu_lti\", description \"Language Technologies Institute at Carnegie Mellon University\", and sidebar \"**announcement**, paper, **alumni**\"", - 
"intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", - "instantiation_dict": { - "name": "cmu_lti", - "description": "Language Technologies Institute at Carnegie Mellon University", - "sidebar_list": "**announcement**, paper, **alumni**" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/create_forum", - "http_method": "POST", - "response_status": 302, - "post_data": { - "forum[name]": "cmu_lti", - "forum[title]": "cmu_lti", - "forum[description]": "Language Technologies Institute at Carnegie Mellon University", - "forum[sidebar]": "**announcement**, paper, **alumni**" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 582, - "intent_template_id": 7, - "start_urls": ["__REDDIT__"], - "intent": "Create a new forum with name and title \"Cyberpunk\", description \"Welcome to the future\", and sidebar \"Games, *Books*, Movies, *Future*\"", - "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", - "instantiation_dict": { - "name": "Cyberpunk", - "description": "Welcome to the future", - "sidebar_list": "Games, *Books*, Movies, *Future*" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/create_forum", - "http_method": "POST", - "response_status": 302, - "post_data": { - "forum[name]": "Cyberpunk", - "forum[title]": "Cyberpunk", - "forum[description]": "Welcome to the future", - "forum[sidebar]": "Games, *Books*, Movies, *Future*" - } - } - } - 
], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 583, - "intent_template_id": 7, - "start_urls": ["__REDDIT__"], - "intent": "Create a new forum with name and title \"PlantsForCatParents\", description \"Cat parents & plan lovers\", and sidebar \"**Cat friendly**, Local vendors, Promotion, *Toxic plants!*\"", - "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", - "instantiation_dict": { - "name": "PlantsForCatParents", - "description": "Cat parents & plan lovers", - "sidebar_list": "**Cat friendly**, Local vendors, Promotion, *Toxic plants!*" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/create_forum", - "http_method": "POST", - "response_status": 302, - "post_data": { - "forum[name]": "PlantsForCatParents", - "forum[title]": "PlantsForCatParents", - "forum[description]": "Cat parents & plan lovers", - "forum[sidebar]": "**Cat friendly**, Local vendors, Promotion, *Toxic plants!*" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 584, - "intent_template_id": 7, - "start_urls": ["__REDDIT__"], - "intent": "Create a new forum with name and title \"Karaoke\", description \"Place for Karaoke lovers\", and sidebar \"*devices*, setup\"", - "intent_template": "Create a new forum with name and title \"{{name}}\", description \"{{description}}\", and sidebar \"{{sidebar_list}}\"", - "instantiation_dict": { - "name": "Karaoke", - "description": "Place for Karaoke lovers", - "sidebar_list": "*devices*, setup" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/create_forum", - "http_method": "POST", - "response_status": 302, - "post_data": { - "forum[name]": "Karaoke", - "forum[title]": "Karaoke", - "forum[description]": "Place for Karaoke lovers", - "forum[sidebar]": "*devices*, setup" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 585, - "intent_template_id": 194, - "start_urls": ["__SHOPPING__"], - "intent": "Rate my recently purchased floor lamp with 5 stars using my nickname Emma Lopez, with the summary \"Good purchase\" and review \"I like it\"", - "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", - "instantiation_dict": { - "product": "floor lamp", - "num_star": 5, - "nickname": "Emma Lopez", - "summary": "Good purchase", - "review": "I like it" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/review/product/post/id/73063/", - "http_method": "POST", - "post_data": { - "ratings[4]": "20", - "nickname": "Emma Lopez", - "title": "Good purchase", - "detail": "I like it" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 586, - "intent_template_id": 194, - "start_urls": ["__SHOPPING__"], - "intent": "Rate my recently purchased Jiffy Mix with 4 stars using my nickname ShoppingEmma, with the summary \"Good purchase\" and review \"I like it\"", - "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", - "instantiation_dict": { - "product": "Jiffy Mix", - "num_star": 4, - "nickname": "ShoppingEmma", - "summary": 
"Good purchase", - "review": "I like it" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/review/product/post/id/102586/", - "http_method": "POST", - "post_data": { - "ratings[4]": "19", - "nickname": "ShoppingEmma", - "title": "Good purchase", - "detail": "I like it" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 587, - "intent_template_id": 194, - "start_urls": ["__SHOPPING__"], - "intent": "Rate my recently purchased PS3 accessory with 3 stars using my nickname GamingEmma, with the summary \"Ok I guess\" and review \"Does the job\"", - "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", - "instantiation_dict": { - "product": "PS3 accessory", - "num_star": 3, - "nickname": "GamingEmma", - "summary": "Ok I guess", - "review": "Does the job" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/review/product/post/id/101441/", - "http_method": "POST", - "post_data": { - "ratings[4]": "18", - "nickname": "GamingEmma", - "title": "Ok I guess", - "detail": "Does the job" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 588, - "intent_template_id": 194, - "start_urls": ["__SHOPPING__"], - "intent": "Rate my recently purchased Foundation For Mattress With Frame Set with 1 stars using my nickname ShoppingEmma, with the summary \"Very bad\" and review \"I hated it\"", - "intent_template": "Rate my recently 
purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", - "instantiation_dict": { - "product": "Foundation For Mattress With Frame Set", - "num_star": 1, - "nickname": "ShoppingEmma", - "summary": "Very bad", - "review": "I hated it" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/review/product/post/id/14854/", - "http_method": "POST", - "post_data": { - "ratings[4]": "16", - "nickname": "ShoppingEmma", - "title": "Very bad", - "detail": "I hated it" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 589, - "intent_template_id": 194, - "start_urls": ["__SHOPPING__"], - "intent": "Rate my recently purchased Mini Wireless Bluetooth Speaker with 2 stars using my nickname SimpleEmma, with the summary \"Very bad\" and review \"I hated it\"", - "intent_template": "Rate my recently purchased {{product}} with {{num_star}} stars using my nickname {{nickname}}, with the summary \"{{summary}}\" and review \"{{review}}\"", - "instantiation_dict": { - "product": "Mini Wireless Bluetooth Speaker", - "num_star": 2, - "nickname": "SimpleEmma", - "summary": "Very bad", - "review": "I hated it" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING__/review/product/post/id/76228/", - "http_method": "POST", - "post_data": { - "ratings[4]": "17", - "nickname": "SimpleEmma", - "title": "Very bad", - "detail": "I hated it" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - 
"task_id": 590, - "intent_template_id": 339, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone in the current repo with title \"product launch\" for the upcoming event of product launch starting on January 16, 2023 and ending on January 30, 2023", - "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", - "instantiation_dict": { - "title": "product launch", - "event": "event of product launch", - "start_date": "January 16, 2023", - "end_date": "on January 30, 2023" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/-/milestones", - "http_method": "POST", - "post_data": { - "milestone[title]": "product launch", - "milestone[start_date]": "2023-01-16", - "milestone[due_date]": "2023-01-30" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 591, - "intent_template_id": 339, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone in the current repo with title \"code review\" for the upcoming practice of collective code review starting on January 16, 2023 and ending in 20 days (inclusive)", - "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", - "instantiation_dict": { - "title": "code review", - "event": "practice of collective code review", - "start_date": "January 16, 2023", - "end_date": "in 20 days (inclusive)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": 
"NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/-/milestones", - "http_method": "POST", - "post_data": { - "milestone[title]": "code review", - "milestone[start_date]": "2023-01-16", - "milestone[due_date]": "2023-02-04" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 592, - "intent_template_id": 339, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Create a milestone in the current repo with title \"sensitive information\" for the upcoming task of cleaning sensitive information starting on February 16, 2023 and ending in 20 days (inclusive)", - "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", - "instantiation_dict": { - "title": "sensitive information", - "event": "task of cleaning sensitive information", - "start_date": "February 16, 2023", - "end_date": "in 20 days (inclusive)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/-/milestones", - "http_method": "POST", - "post_data": { - "milestone[title]": "sensitive information", - "milestone[start_date]": "2023-02-16", - "milestone[due_date]": "2023-03-07" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 593, - "intent_template_id": 339, - "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Create a milestone in the current repo with title \"all branches to main\" for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044", - "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and 
ending {{end_date}}", - "instantiation_dict": { - "title": "all branches to main", - "event": "task of merging all branches to main", - "start_date": "March 15, 2044", - "end_date": "on March 30, 2044" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/dotfiles/-/milestones", - "http_method": "POST", - "post_data": { - "milestone[title]": "all branches to main", - "milestone[start_date]": "2044-03-15", - "milestone[due_date]": "2044-03-30" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 594, - "intent_template_id": 339, - "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Create a milestone in the current repo with title \"zsh comprehensive support\" for the upcoming task of adding a new branch for zsh comprehensive support starting on May 1, 2044 and ending in 20 days (inclusive)", - "intent_template": "Create a milestone in the current repo with title \"{{title}}\" for the upcoming {{event}} starting on {{start_date}} and ending {{end_date}}", - "instantiation_dict": { - "title": "zsh comprehensive support", - "event": "task of adding a new branch for zsh comprehensive support", - "start_date": "May 1, 2044", - "end_date": "in 20 days (inclusive)" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/dotfiles/-/milestones", - "http_method": "POST", - "post_data": { - "milestone[title]": "zsh comprehensive support", - "milestone[start_date]": "2044-05-01", - "milestone[due_date]": "2044-05-20" - }, - "response_status": 302 - } - } - ], - "revision": 2 - 
}, - { - "sites": ["reddit"], - "task_id": 595, - "intent_template_id": 4, - "start_urls": ["__REDDIT__"], - "intent": "Subscribe to the \"space\" forum from the page of the hottest post in that forum.", - "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", - "instantiation_dict": {"post_selector": "hottest", "forum": "space"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/f/space/subscribe.json", "__REDDIT__/f/space/unsubscribe.json"], - "http_method": "POST", - "headers": { - "referer": "__REDDIT__/f/space/69581/the-moon-saturn-and-jupiter-through-my-4-telescope-and" - }, - "response_content": {"subscribed": true} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 596, - "intent_template_id": 4, - "start_urls": ["__REDDIT__"], - "intent": "Subscribe to the \"books\" forum from the page of the all time top post in that forum.", - "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", - "instantiation_dict": {"post_selector": "all time top", "forum": "books"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/f/books/subscribe.json", "__REDDIT__/f/books/unsubscribe.json"], - "http_method": "POST", - "headers": { - "referer": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed" - }, - "response_content": {"subscribed": true} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 597, - "intent_template_id": 4, - "start_urls": 
["__REDDIT__"], - "intent": "Subscribe to the \"consoles\" forum from the page of the most controversial post in that forum.", - "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", - "instantiation_dict": {"post_selector": "most controversial", "forum": "consoles"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": [ - "__REDDIT__/f/consoles/subscribe.json", "__REDDIT__/f/consoles/unsubscribe.json" - ], - "http_method": "POST", - "headers": { - "referer": "__REDDIT__/f/consoles/17949/i-like-xbox-series-s-more-than-xbox-series-x" - }, - "response_content": {"subscribed": true} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 598, - "intent_template_id": 4, - "start_urls": ["__REDDIT__"], - "intent": "Subscribe to the \"pittsburgh\" forum from the page of the all time most commented post in that forum.", - "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", - "instantiation_dict": {"post_selector": "all time most commented", "forum": "pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": [ - "__REDDIT__/f/pittsburgh/subscribe.json", - "__REDDIT__/f/pittsburgh/unsubscribe.json" - ], - "http_method": "POST", - "headers": { - "referer": "__REDDIT__/f/pittsburgh/132053/is-bbq-chipped-chopped-ham-a-thing-my-grandma-says-it-is" - }, - "response_content": {"subscribed": true} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 599, - "intent_template_id": 4, - "start_urls": ["__REDDIT__"], - 
"intent": "Subscribe to the \"machine learning\" forum from the page of the newest post in that forum.", - "intent_template": "Subscribe to the \"{{forum}}\" forum from the page of the {{post_selector}} post in that forum.", - "instantiation_dict": {"post_selector": "newest", "forum": "machine learning"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": [ - "__REDDIT__/f/machine learning/subscribe.json", - "__REDDIT__/f/machine learning/unsubscribe.json" - ], - "http_method": "POST", - "headers": { - "referer": "__REDDIT__/f/machine learning/120613/d-yan-lecun-s-recent-recommendations" - }, - "response_content": {"subscribed": true} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 600, - "intent_template_id": 3765, - "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"what is the recommended console to buy these days?\", in a forum where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", - "instantiation_dict": {"question": "what is the recommended console to buy these days?"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/consoles"], - "http_method": "POST", - "post_data": { - "submission[title]": "what is the recommended console to buy these days?", - "submission[forum]": "10039" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 601, - "intent_template_id": 3765, - "start_urls": ["__REDDIT__"], - "intent": "Post my question 
with the title \"is car necessary in NYC\", in a forum where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", - "instantiation_dict": {"question": "is car necessary in NYC"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], - "http_method": "POST", - "post_data": {"submission[title]": "is car necessary in NYC", "submission[forum]": "10067"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 602, - "intent_template_id": 3765, - "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"places for new drivers to learn driving in pittsburgh\", in a forum where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", - "instantiation_dict": {"question": "places for new drivers to learn driving in pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/pittsburgh"], - "http_method": "POST", - "post_data": { - "submission[title]": "places for new drivers to learn driving in pittsburgh", - "submission[forum]": "10073" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 603, - "intent_template_id": 3765, - "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"safe and budget apartment to live in nyc\", in a forum where I'm likely to get an answer", - 
"intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", - "instantiation_dict": {"question": "safe and budget apartment to live in nyc"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], - "http_method": "POST", - "post_data": { - "submission[title]": "safe and budget apartment to live in nyc", - "submission[forum]": "10067" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 604, - "intent_template_id": 3765, - "start_urls": ["__REDDIT__"], - "intent": "Post my question with the title \"what is the SOTA web navigation agent repo\", in a forum where I'm likely to get an answer", - "intent_template": "Post my question with the title \"{{question}}\", in a forum where I'm likely to get an answer", - "instantiation_dict": {"question": "what is the SOTA web navigation agent repo"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": [ - "__REDDIT__/submit", "__REDDIT__/submit/deeplearning", - "__REDDIT__/submit/MachineLearning", "__REDDIT__/submit/singularity" - ], - "http_method": "POST", - "post_data": { - "submission[title]": "what is the SOTA web navigation agent repo", - "submission[forum]": "{{forum_id}}" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 605, - "intent_template_id": 5, - "start_urls": ["__REDDIT__"], - "intent": "Find a forum focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these 
days\" there", - "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", - "instantiation_dict": { - "topic": "gaming consoles", - "question": "what is the recommended console to buy these days" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/consoles"], - "http_method": "POST", - "post_data": { - "submission[title]": "what is the recommended console to buy these days", - "submission[forum]": "10039" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 606, - "intent_template_id": 5, - "start_urls": ["__REDDIT__"], - "intent": "Find a forum focused on topics related to NYC, and post my question, \"is car necessary\" there", - "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", - "instantiation_dict": {"topic": "NYC", "question": "is car necessary"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], - "http_method": "POST", - "post_data": {"submission[title]": "is car necessary", "submission[forum]": "10067"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 607, - "intent_template_id": 5, - "start_urls": ["__REDDIT__"], - "intent": "Find a forum focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there", - "intent_template": "Find a forum focused on topics related to {{topic}}, and post 
my question, \"{{question}}\" there", - "instantiation_dict": { - "topic": "city Pittsburgh", - "question": "places for new drivers to learn driving" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/pittsburgh"], - "http_method": "POST", - "post_data": { - "submission[title]": "places for new drivers to learn driving", - "submission[forum]": "10073" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 608, - "intent_template_id": 5, - "start_urls": ["__REDDIT__"], - "intent": "Find a forum focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there", - "intent_template": "Find a forum focused on topics related to {{topic}}, and post my question, \"{{question}}\" there", - "instantiation_dict": { - "topic": "city lives in DMV area", - "question": "safe and budget apartment to live" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/washington"], - "http_method": "POST", - "post_data": { - "submission[title]": "safe and budget apartment to live", - "submission[forum]": "10030" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 609, - "intent_template_id": 5, - "start_urls": ["__REDDIT__"], - "intent": "Find a forum focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there", - "intent_template": "Find a forum focused on topics related to {{topic}}, and post my 
question, \"{{question}}\" there", - "instantiation_dict": { - "topic": "ML, DL, NLP", - "question": "what is the SOTA web navigation agent repo" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/deeplearning"], - "http_method": "POST", - "post_data": { - "submission[title]": "what is the SOTA web navigation agent repo", - "submission[forum]": "10043" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 610, - "intent_template_id": 9, - "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"To Kill a Mockingbird by Harper Lee\" in f/books and then comment \"good book!\" on the created post", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", - "instantiation_dict": {"book": "To Kill a Mockingbird by Harper Lee", "content": "good book!"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": { - "submission[title]": "To Kill a Mockingbird by Harper Lee", - "submission[forum]": "10037" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/f/books/\\d+/-/comment$", - "http_method": "POST", - "headers": {"referer": "^__REDDIT__/f/books/\\d+/to-kill-a-mockingbird-by-harper-lee$"}, - "post_data": {"$.^reply_to_submission_\\d+\\[comment\\]$": "good book!"}, - "response_status": 302 - } - } - ], - 
"revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 611, - "intent_template_id": 9, - "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Harry Potter\" in f/books and then comment \"Wonderful journey\" on the created post", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", - "instantiation_dict": {"book": "Harry Potter", "content": "Wonderful journey"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": {"submission[title]": "Harry Potter", "submission[forum]": "10037"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/{{post_id}}/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_{{post_id}}[comment]": "Wonderful journey"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 612, - "intent_template_id": 9, - "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"big little lies\" in f/books and then comment \"can't stop it\" on the created post", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", - "instantiation_dict": {"book": "big little lies", "content": "can't stop it"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - 
"http_method": "POST", - "post_data": {"submission[title]": "big little lies", "submission[forum]": "10037"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/{{post_id}}/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_{{post_id}}[comment]": "can't stop it"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 613, - "intent_template_id": 9, - "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Love story\" in f/books and then comment \"I cried\" on the created post", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", - "instantiation_dict": {"book": "Love story", "content": "I cried"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": {"submission[title]": "Love story", "submission[forum]": "10037"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/{{post_id}}/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_{{post_id}}[comment]": "I cried"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 614, - "intent_template_id": 9, - "start_urls": ["__REDDIT__"], - "intent": "Post a review of my recent reading, titled \"Gone with the wind\" in f/books and then comment \"It's a book with history\" on the created post", - "intent_template": "Post a review of my recent reading, titled \"{{book}}\" in f/books and then comment \"{{content}}\" on the created post", - 
"instantiation_dict": {"book": "Gone with the wind", "content": "It's a book with history"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": {"submission[title]": "Gone with the wind", "submission[forum]": "10037"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/{{post_id}}/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_{{post_id}}[comment]": "It's a book with history"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 615, - "intent_template_id": 11, - "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Bald Eagle from this forum to connecticut forum using the image URL and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Bald Eagle", "forum": "connecticut"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[url]": "__REDDIT__/submission_images/b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg", - "submission[title]": "from /f/pics", - "submission[forum]": "10006" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 616, - "intent_template_id": 11, - "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the 
image of Thanksgiving turkey from this forum to funny forum using the image URL and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Thanksgiving turkey", "forum": "funny"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[url]": "__REDDIT__/submission_images/2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg", - "submission[title]": "from /f/pics", - "submission[forum]": "10046" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 617, - "intent_template_id": 11, - "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Firework from this forum to news forum using the image URL and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Firework", "forum": "news"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[url]": "__REDDIT__/submission_images/92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg", - "submission[title]": "from /f/pics", - "submission[forum]": "10064" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 618, - "intent_template_id": 11, - 
"start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of Wife's costume from this forum to funny forum using the image URL and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "Wife's costume", "forum": "funny"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[url]": "__REDDIT__/submission_images/6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg", - "submission[title]": "from /f/pics", - "submission[forum]": "10046" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 619, - "intent_template_id": 11, - "start_urls": ["__REDDIT__/f/pics"], - "intent": "Re-post the image of costume contest from this forum to funny forum using the image URL and title \"from /f/pics\"", - "intent_template": "Re-post the image of {{content}} from this forum to {{forum}} forum using the image URL and title \"from /f/pics\"", - "instantiation_dict": {"content": "costume contest", "forum": "funny"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[url]": "__REDDIT__/submission_images/bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg", - "submission[title]": "from /f/pics", - "submission[forum]": "10046" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": 
["reddit"], - "task_id": 620, - "intent_template_id": 12, - "start_urls": ["__REDDIT__"], - "intent": "Ask for advice in a forum for relations. Title it \"deal with long-distance relationships\" and in post details ask \"Please help\"", - "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", - "instantiation_dict": {"issue": "deal with long-distance relationships"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "deal with long-distance relationships", - "submission[body]": "Please help", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 621, - "intent_template_id": 12, - "start_urls": ["__REDDIT__"], - "intent": "Ask for advice in a forum for relations. Title it \"cheat\" and in post details ask \"Please help\"", - "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", - "instantiation_dict": {"issue": "cheat"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "cheat", - "submission[body]": "Please help", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 622, - "intent_template_id": 12, - "start_urls": ["__REDDIT__"], - "intent": "Ask for advice in a forum for relations. Title it \"sexual harassment\" and in post details ask \"Please help\"", - "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", - "instantiation_dict": {"issue": "sexual harassment"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "sexual harassment", - "submission[body]": "Please help", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 623, - "intent_template_id": 12, - "start_urls": ["__REDDIT__"], - "intent": "Ask for advice in a forum for relations. Title it \"gift for birthday\" and in post details ask \"Please help\"", - "intent_template": "Ask for advice in a forum for relations. 
Title it \"{{issue}}\" and in post details ask \"Please help\"", - "instantiation_dict": {"issue": "gift for birthday"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "gift for birthday", - "submission[body]": "Please help", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 624, - "intent_template_id": 12, - "start_urls": ["__REDDIT__"], - "intent": "Ask for advice in a forum for relations. Title it \"break-up remedy\" and in post details ask \"Please help\"", - "intent_template": "Ask for advice in a forum for relations. Title it \"{{issue}}\" and in post details ask \"Please help\"", - "instantiation_dict": {"issue": "break-up remedy"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "break-up remedy", - "submission[body]": "Please help", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 625, - "intent_template_id": 13, - "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"the effectiveness of deep learning\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their 
opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": {"topic": "the effectiveness of deep learning"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/deeplearning"], - "http_method": "POST", - "post_data": { - "submission[title]": "the effectiveness of deep learning", - "submission[body]": "your opinion", - "submission[forum]": "10043" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 626, - "intent_template_id": 13, - "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Iphone 14\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": {"topic": "Iphone 14"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], - "http_method": "POST", - "post_data": { - "submission[title]": "Iphone 14", - "submission[body]": "your opinion", - "submission[forum]": "10052" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 627, - "intent_template_id": 13, - "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Harry Potter movie series\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" 
in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": {"topic": "Harry Potter movie series"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/movies"], - "http_method": "POST", - "post_data": { - "submission[title]": "Harry Potter movie series", - "submission[body]": "your opinion", - "submission[forum]": "10060" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 628, - "intent_template_id": 13, - "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"long distance relationship\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": {"topic": "long distance relationship"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/relationship_advice"], - "http_method": "POST", - "post_data": { - "submission[title]": "long distance relationship", - "submission[body]": "your opinion", - "submission[forum]": "10075" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 629, - "intent_template_id": 13, - "start_urls": ["__REDDIT__"], - "intent": "Create a discussion post titled \"Fun thing to do in Pittsburgh\" in a relevant forum and ask users for their opinions with the simple 
prompt, \"your opinion\"", - "intent_template": "Create a discussion post titled \"{{topic}}\" in a relevant forum and ask users for their opinions with the simple prompt, \"your opinion\"", - "instantiation_dict": {"topic": "Fun thing to do in Pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/pittsburgh"], - "http_method": "POST", - "post_data": { - "submission[title]": "Fun thing to do in Pittsburgh", - "submission[body]": "your opinion", - "submission[forum]": "10073" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 630, - "intent_template_id": 15, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in f/headphones. Title it \"noise-cancelling headphones recommendations?\" and in post details ask \"Hey, looking for good noise-cancelling headphones under $200, any suggestions?\"", - "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", - "instantiation_dict": { - "category": "noise-cancelling headphones", - "price": "$200", - "forum": "f/headphones" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], - "http_method": "POST", - "post_data": { - "submission[title]": "noise-cancelling headphones recommendations?", - "submission[body]": "Hey, looking for good noise-cancelling headphones under $200, any suggestions?", - "submission[forum]": "10050" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 631, - "intent_template_id": 15, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $100, any suggestions?\"", - "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", - "instantiation_dict": {"category": "running shoes", "price": "$100", "forum": "f/sports"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], - "http_method": "POST", - "post_data": { - "submission[title]": "running shoes recommendations?", - "submission[body]": "Hey, looking for good running shoes under $100, any suggestions?", - "submission[forum]": "10080" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 632, - "intent_template_id": 15, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in f/sports. Title it \"running shoes recommendations?\" and in post details ask \"Hey, looking for good running shoes under $500, any suggestions?\"", - "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", - "instantiation_dict": {"category": "running shoes", "price": "$500", "forum": "f/sports"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], - "http_method": "POST", - "post_data": { - "submission[title]": "running shoes recommendations?", - "submission[body]": "Hey, looking for good running shoes under $500, any suggestions?", - "submission[forum]": "10080" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 633, - "intent_template_id": 15, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in f/sports. Title it \"running pants recommendations?\" and in post details ask \"Hey, looking for good running pants under $500, any suggestions?\"", - "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", - "instantiation_dict": {"category": "running pants", "price": "$500", "forum": "f/sports"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/sports"], - "http_method": "POST", - "post_data": { - "submission[title]": "running pants recommendations?", - "submission[body]": "Hey, looking for good running pants under $500, any suggestions?", - "submission[forum]": "10080" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 634, - "intent_template_id": 15, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in f/iphone. Title it \"used iphone recommendations?\" and in post details ask \"Hey, looking for good used iphone under $1000, any suggestions?\"", - "intent_template": "Create a post in {{forum}}. 
Title it \"{{category}} recommendations?\" and in post details ask \"Hey, looking for good {{category}} under {{price}}, any suggestions?\"", - "instantiation_dict": {"category": "used iphone", "price": "$1000", "forum": "f/iphone"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], - "http_method": "POST", - "post_data": { - "submission[title]": "used iphone recommendations?", - "submission[body]": "Hey, looking for good used iphone under $1000, any suggestions?", - "submission[forum]": "10052" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 635, - "intent_template_id": 6100, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in the most appropriate forum. Title it \"noise-cancelling headphones recommendations\" and in post details ask \"I need recommendations for noise-cancelling headphones within a budget of $200 please\"", - "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", - "instantiation_dict": {"category": "noise-cancelling headphones", "price": "$200"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], - "http_method": "POST", - "post_data": { - "submission[title]": "noise-cancelling headphones recommendations", - "submission[body]": "I need recommendations for noise-cancelling headphones within a budget of $200 please", - "submission[forum]": "10050" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 636, - "intent_template_id": 6100, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in the most appropriate forum. Title it \"DIY toolkit recommendations\" and in post details ask \"I need recommendations for DIY toolkit within a budget of $100 please\"", - "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", - "instantiation_dict": {"category": "DIY toolkit", "price": "$100"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/diy"], - "http_method": "POST", - "post_data": { - "submission[title]": "DIY toolkit recommendations", - "submission[body]": "I need recommendations for DIY toolkit within a budget of $100 please", - "submission[forum]": "10007" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 637, - "intent_template_id": 6100, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in the most appropriate forum. Title it \"sony headphones recommendations\" and in post details ask \"I need recommendations for sony headphones within a budget of $500 please\"", - "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", - "instantiation_dict": {"category": "sony headphones", "price": "$500"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/headphones"], - "http_method": "POST", - "post_data": { - "submission[title]": "sony headphones recommendations", - "submission[body]": "I need recommendations for sony headphones within a budget of $500 please", - "submission[forum]": "10050" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 638, - "intent_template_id": 6100, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in the most appropriate forum. Title it \"Must have product at last for ever recommendations\" and in post details ask \"I need recommendations for Must have product at last for ever within a budget of $30 please\"", - "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", - "instantiation_dict": {"category": "Must have product at last for ever", "price": "$30"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/BuyItForLife"], - "http_method": "POST", - "post_data": { - "submission[title]": "Must have product at last for ever recommendations", - "submission[body]": "I need recommendations for Must have product at last for ever within a budget of $30 please", - "submission[forum]": "10003" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 639, - "intent_template_id": 6100, - "start_urls": ["__REDDIT__"], - "intent": "Create a post in the most appropriate forum. Title it \"used iphone recommendations\" and in post details ask \"I need recommendations for used iphone within a budget of $1000 please\"", - "intent_template": "Create a post in the most appropriate forum. 
Title it \"{{category}} recommendations\" and in post details ask \"I need recommendations for {{category}} within a budget of {{price}} please\"", - "instantiation_dict": {"category": "used iphone", "price": "$1000"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/iphone"], - "http_method": "POST", - "post_data": { - "submission[title]": "used iphone recommendations", - "submission[body]": "I need recommendations for used iphone within a budget of $1000 please", - "submission[forum]": "10052" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 640, - "intent_template_id": 16, - "start_urls": ["__REDDIT__"], - "intent": "Post a notice in f/books titled \"book reading Meet up!\". Set post details to \"virtual meetup for book reading on March 15th\"", - "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", - "instantiation_dict": {"interest": "book reading", "date": "March 15th", "forum": "f/books"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": { - "submission[title]": "book reading Meet up!", - "submission[body]": "virtual meetup for book reading on March 15th", - "submission[forum]": "10037" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 641, - "intent_template_id": 16, - "start_urls": ["__REDDIT__"], - "intent": "Post a notice in books forum titled \"Harry Potter Meet up!\". Set post details to \"virtual meetup for Harry Potter on July 8th\"", - "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". Set post details to \"virtual meetup for {{interest}} on {{date}}\"", - "instantiation_dict": {"interest": "Harry Potter", "date": "July 8th", "forum": "books forum"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": { - "submission[title]": "Harry Potter Meet up!", - "submission[body]": "virtual meetup for Harry Potter on July 8th", - "submission[forum]": "10037" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 642, - "intent_template_id": 16, - "start_urls": ["__REDDIT__"], - "intent": "Post a notice in books forum titled \"Big little lies Meet up!\". 
Set post details to \"virtual meetup for Big little lies on Sep 10th\"", - "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". Set post details to \"virtual meetup for {{interest}} on {{date}}\"", - "instantiation_dict": {"interest": "Big little lies", "date": "Sep 10th", "forum": "books forum"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": { - "submission[title]": "Big little lies Meet up!", - "submission[body]": "virtual meetup for Big little lies on Sep 10th", - "submission[forum]": "10037" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 643, - "intent_template_id": 16, - "start_urls": ["__REDDIT__"], - "intent": "Post a notice in nyc forum titled \"racing cars Meet up!\". Set post details to \"virtual meetup for racing cars on Oct 21st\"", - "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", - "instantiation_dict": {"interest": "racing cars", "date": "Oct 21st", "forum": "nyc forum"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/nyc"], - "http_method": "POST", - "post_data": { - "submission[title]": "racing cars Meet up!", - "submission[body]": "virtual meetup for racing cars on Oct 21st", - "submission[forum]": "10067" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 644, - "intent_template_id": 16, - "start_urls": ["__REDDIT__"], - "intent": "Post a notice in games forum titled \"Tears of Kingdom Meet up!\". Set post details to \"virtual meetup for Tears of Kingdom on Dec 15th\"", - "intent_template": "Post a notice in {{forum}} titled \"{{interest}} Meet up!\". 
Set post details to \"virtual meetup for {{interest}} on {{date}}\"", - "instantiation_dict": {"interest": "Tears of Kingdom", "date": "Dec 15th", "forum": "games forum"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/gaming"], - "http_method": "POST", - "post_data": { - "submission[title]": "Tears of Kingdom Meet up!", - "submission[body]": "virtual meetup for Tears of Kingdom on Dec 15th", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 645, - "intent_template_id": 19, - "start_urls": ["__REDDIT__"], - "intent": "Post in books forum with title \"What could machine learning help the books field?\"", - "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", - "instantiation_dict": { - "model_type": "machine learning", - "title": "What could machine learning help the books field?", - "forum": "books" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/books"], - "http_method": "POST", - "post_data": { - "submission[title]": "What could machine learning help the books field?", - "submission[forum]": "10037" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 646, - "intent_template_id": 19, - "start_urls": ["__REDDIT__"], - "intent": "Post in DIY forum with title \"What could midjourney help the DIY field?\"", - "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", - "instantiation_dict": { - "model_type": 
"midjourney", - "title": "What could midjourney help the DIY field?", - "forum": "DIY" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/DIY"], - "http_method": "POST", - "post_data": { - "submission[title]": "What could midjourney help the DIY field?", - "submission[forum]": "10007" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 647, - "intent_template_id": 19, - "start_urls": ["__REDDIT__"], - "intent": "Post in technology forum with title \"What could open-source LLMs help the technology field?\"", - "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", - "instantiation_dict": { - "model_type": "open-source LLMs", - "title": "What could open-source LLMs help the technology field?", - "forum": "technology" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/technology"], - "http_method": "POST", - "post_data": { - "submission[title]": "What could open-source LLMs help the technology field?", - "submission[forum]": "10082" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 648, - "intent_template_id": 19, - "start_urls": ["__REDDIT__"], - "intent": "Post in dataisbeautiful forum with title \"What could large language models help the dataisbeautiful field?\"", - "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", - "instantiation_dict": { - "model_type": "large language models", - "title": "What could large language models help the 
dataisbeautiful field?", - "forum": "dataisbeautiful" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/dataisbeautiful"], - "http_method": "POST", - "post_data": { - "submission[title]": "What could large language models help the dataisbeautiful field?", - "submission[forum]": "10042" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 649, - "intent_template_id": 19, - "start_urls": ["__REDDIT__"], - "intent": "Post in dataisbeautiful forum with title \"What could diffusion model help the dataisbeautiful field?\"", - "intent_template": "Post in {{forum}} forum with title \"{{title}}\"", - "instantiation_dict": { - "model_type": "diffusion model", - "title": "What could diffusion model help the dataisbeautiful field?", - "forum": "dataisbeautiful" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": ["__REDDIT__/submit", "__REDDIT__/submit/dataisbeautiful"], - "http_method": "POST", - "post_data": { - "submission[title]": "What could diffusion model help the dataisbeautiful field?", - "submission[forum]": "10042" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 650, - "intent_template_id": 23, - "start_urls": ["__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists"], - "intent": "Reply to the post on this page with my comment \"I am a big fan of the bookorg\"", - "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", - "instantiation_dict": { 
- "position_description": "the post", - "location": "on this page", - "comment_prefix": "my comment ", - "content_description": "I am a big fan of the bookorg" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/books/59421/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_59421[comment]": "I am a big fan of the bookorg"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 651, - "intent_template_id": 23, - "start_urls": [ - "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign" - ], - "intent": "Reply to the post on this page with my comment \"Yeah, pittsburgh traffic, you know...\"", - "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", - "instantiation_dict": { - "position_description": "the post", - "location": "on this page", - "comment_prefix": "my comment ", - "content_description": "Yeah, pittsburgh traffic, you know..." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/pittsburgh/45899/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_45899[comment]": "Yeah, pittsburgh traffic, you know..."}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 652, - "intent_template_id": 23, - "start_urls": [ - "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research" - ], - "intent": "Reply to the post on this page with my comment \"???\"", - "intent_template": "Reply to {{position_description}} {{location}} with {{comment_prefix}}\"{{content_description}}\"", - "instantiation_dict": { - "position_description": "the post", - "location": "on this page", - "comment_prefix": "my comment ", - "content_description": "???" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/technology/134852/-/comment", - "http_method": "POST", - "post_data": {"reply_to_submission_134852[comment]": "???"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 653, - "intent_template_id": 153, - "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #000000180 and the product SKU. 
Don\"t submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "phone screen protector", "order_id": "000000180"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 654, - "intent_template_id": 153, - "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #161 and the product SKU. Don\"t submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "bluetooth speaker", "order_id": "161"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 655, - "intent_template_id": 153, - "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that \"it broke after just three days of use\". 
Also, ensure to include the order number #180 and the product SKU. Don\"t submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "iphone case", "order_id": "180"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 656, - "intent_template_id": 153, - "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the console skin I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #180 and the product SKU. Don\"t submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. 
Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "console skin", "order_id": "180"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 3 - }, - { - "sites": ["shopping"], - "task_id": 657, - "intent_template_id": 153, - "start_urls": ["__SHOPPING__"], - "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #148 and the product SKU. Don\"t submit yet, I will check.", - "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that \"it broke after just three days of use\". Also, ensure to include the order number #{{order_id}} and the product SKU. Don\"t submit yet, I will check.", - "instantiation_dict": {"product": "speaker", "order_id": "148"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/contact"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 658, - "intent_template_id": 327, - "start_urls": ["__GITLAB__"], - "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey (you can invite users to the project if needed). Set due date to be the end of 2030", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", - "instantiation_dict": { - "repo": "a11yproject", - "issue": "401 bad gateway", - "account": "Roshanjossey", - "due": "the end of 2030" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", - "http_method": "POST", - "post_data": { - "issue[title]": "401 bad gateway", - "issue[assignee_ids][]": "2264", - "issue[due_date]": "2030-12-31" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 659, - "intent_template_id": 327, - "start_urls": ["__GITLAB__"], - "intent": "Create an issue in my empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey (you can invite users to the project if needed). Set due date to be the beginning of Q2 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", - "instantiation_dict": { - "repo": "my empathy-prompts", - "issue": "Integrating LLMs for better prompts", - "account": "Roshanjossey", - "due": "the beginning of Q2 2033" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/183/invitations", - "http_method": "POST", - "post_data": {"user_id": "2264"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/empathy-prompts/-/issues", - "http_method": "POST", - "post_data": { - "issue[title]": "Integrating LLMs for better prompts", - "issue[assignee_ids][]": "2264", - "issue[due_date]": "2033-04-01" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 660, - "intent_template_id": 327, - "start_urls": ["__GITLAB__"], - "intent": "Create an issue in my dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek (you can invite users to the project if needed). Set due date to be July 18 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", - "instantiation_dict": { - "repo": "my dotfiles", - "issue": "add support for oh-my-zsh", - "account": "Abishek", - "due": "July 18 2033" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/193/invitations", - "http_method": "POST", - "post_data": {"user_id": "5"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/dotfiles/-/issues", - "http_method": "POST", - "post_data": { - "issue[title]": "add support for oh-my-zsh", - "issue[assignee_ids][]": "5", - "issue[due_date]": "2033-07-18" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 661, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Connection refused\" to discuss the connection refused exception in the respository ChatGPT.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "Connection refused", - "issue": "discuss the connection refused exception", - "repo": "ChatGPT" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/175/issues", - "http_method": "POST", - "post_data": {"title": "Connection refused"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 662, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"OSError: [Errno 98] Address already in use\" to 
discuss the OS error in the respository aem-hacker.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "OSError: [Errno 98] Address already in use", - "issue": "discuss the OS error", - "repo": "aem-hacker" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/138/issues", - "http_method": "POST", - "post_data": {"title": "OSError: [Errno 98] Address already in use"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 663, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Llama model support timeline\" to ask the team\"s plan on supporting Llama and other llama family models in the respository metaseq.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "Llama model support timeline", - "issue": "ask the team\"s plan on supporting Llama and other llama family models", - "repo": "metaseq" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/33/issues", - "http_method": "POST", - "post_data": {"title": "Llama model support timeline"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 664, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Question on future usage of Python 3.11\" to ask the team\"s plans on adding Python 3.11 related 
resources in the respository awesome-python.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "Question on future usage of Python 3.11", - "issue": "ask the team\"s plans on adding Python 3.11 related resources", - "repo": "awesome-python" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/84/issues", - "http_method": "POST", - "post_data": {"title": "Question on future usage of Python 3.11"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 665, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"Feature Request: MT support\" to request adding support for MT theme editor in the respository a11y-syntax-highlighting.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "Feature Request: MT support", - "issue": "request adding support for MT theme editor", - "repo": "a11y-syntax-highlighting" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects/186/issues", - "http_method": "POST", - "post_data": {"title": "Feature Request: MT support"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 666, - "intent_template_id": 335, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a merge request for dialog-component in the current repository to be merged into dialog branch, 
assign Primer as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "dialog-component", - "target_branch": "dialog", - "reviewer": "Primer", - "source_project": "the current repository" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/-/merge_requests", - "http_method": "POST", - "post_data": { - "merge_request[target_branch]": "dialog", - "merge_request[source_branch]": "dialog-component", - "merge_request[target_project_id]": "180", - "merge_request[source_project_id]": "180", - "merge_request[assignee_ids][]": "0", - "merge_request[reviewer_ids][]": "2367" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 667, - "intent_template_id": 335, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a merge request for dialog-component in the current repository to be merged into bump-doctocat branch, assign primer as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "dialog-component", - "target_branch": "bump-doctocat", - "reviewer": "primer", - "source_project": "the current repository" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/-/merge_requests", - "http_method": "POST", - 
"post_data": { - "merge_request[target_branch]": "bump-doctocat", - "merge_request[source_branch]": "dialog-component", - "merge_request[target_project_id]": "180", - "merge_request[source_project_id]": "180", - "merge_request[assignee_ids][]": "0", - "merge_request[reviewer_ids][]": "2367" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 668, - "intent_template_id": 335, - "start_urls": ["__GITLAB__"], - "intent": "Submit a merge request for redesign in a11yproject.com to be merged into main branch, assign Roshan Jossy as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "redesign", - "target_branch": "main", - "reviewer": "Roshan Jossy", - "source_project": "a11yproject.com" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", - "http_method": "POST", - "post_data": { - "merge_request[target_branch]": "main", - "merge_request[source_branch]": "redesign", - "merge_request[target_project_id]": "174", - "merge_request[source_project_id]": "174", - "merge_request[assignee_ids][]": "0", - "merge_request[reviewer_ids][]": "2264" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 669, - "intent_template_id": 337, - "start_urls": ["__GITLAB__/byteblaze/solarized-prism-theme"], - "intent": "Open a new issue to discuss the implementation of dark mode for the current project. Use an appropriate issue title.", - "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project. 
Use an appropriate issue title.", - "instantiation_dict": {"feature": "dark mode"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues", - "http_method": "POST", - "post_data": {"issue[title]": "^.*dark.*mode.*$"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 670, - "intent_template_id": 337, - "start_urls": ["__GITLAB__/byteblaze/dotfiles"], - "intent": "Open a new issue to discuss the implementation of default plugins for .zsh for the current project. Use an appropriate issue title.", - "intent_template": "Open a new issue to discuss the implementation of {{feature}} for the current project. Use an appropriate issue title.", - "instantiation_dict": {"feature": "default plugins for .zsh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/dotfiles/-/issues", - "http_method": "POST", - "post_data": {"issue[title]": "^.*default.*plugins.*zsh.*$"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping", "reddit"], - "task_id": 671, - "intent_template_id": 101, - "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Create a post in the game related discussion forum about Sony Computer Entertainment VR to report customer reviews with 2 stars and less rating from the OneStopShop with the post title \"real user feedback on Sony Computer Entertainment VR\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "instantiation_dict": {"product": "Sony Computer Entertainment VR", "rating": "2 stars and less"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[title]": "real user feedback on Sony Computer Entertainment VR", - "submission[body]": "- \"Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.\"\r\n- \"didn't last a year without issues\"\r\n- \"Received used items!!\"\r\n- \"Received used items!!\"", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping", "reddit"], - "task_id": 672, - "intent_template_id": 101, - "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Create a post in the game related discussion forum about Nintendo Switch Fortnite Wildcat Console EU to report customer reviews with 3 stars and less rating from the OneStopShop with the post title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "instantiation_dict": { - "product": "Nintendo Switch Fortnite Wildcat Console EU", - "rating": "3 stars and less" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[title]": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU", - "submission[body]": "- \"EU charger and wild cat card doesn't even work!\"\r\n- \"REFUND REJECTED\"\r\n- \"Charging port not compatible\"\r\n- \"not compatible in the US\"\r\n- \"Wildcard Bonus Credits Not Redeemable!\"\r\n- \"Code not available!!\"\r\n- \"Code not available!!\"", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping", "reddit"], - "task_id": 673, - "intent_template_id": 101, - "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Create a post in the game related discussion forum about Racing Wheel Overdrive for Xbox X to report customer reviews with 1 star rating from the OneStopShop with the post title \"real user feedback on Racing Wheel Overdrive for Xbox X\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "instantiation_dict": {"product": "Racing Wheel Overdrive for Xbox X", "rating": "1 star"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[title]": "real user feedback on Racing Wheel Overdrive for Xbox X", - "submission[body]": "- \"Unable to set neutral steering\"\r\n- \"Doesn't work with PC.\"\r\n- \"Crazy problems in automatic mode; then pedals stopped working\"\r\n- \"Only works with certain games.\"\r\n- \"Only works with certain games.\"", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping", "reddit"], - "task_id": 674, - "intent_template_id": 101, - "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Create a post in the game related discussion forum about Doc and Pies Arcade Factory Cocktail Arcade Machine to report customer reviews with 3 stars and less rating from the OneStopShop with the post title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "instantiation_dict": { - "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine", - "rating": "3 stars and less" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[title]": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine", - "submission[body]": "- \"Poorly Made Exterior. Consider a different Company.\"\r\n- \"piece of junk ,..can't believe I spent money on this !!!!\"\r\n- \"Based arrived broken but game itself works\"\r\n- \"Based arrived broken but game itself works\"", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping", "reddit"], - "task_id": 675, - "intent_template_id": 101, - "start_urls": ["__SHOPPING__", "__REDDIT__"], - "intent": "Create a post in the game related discussion forum about HORI 3D Surround Gaming Neckset to report customer reviews with 2 stars and less rating from the OneStopShop with the post title \"real user feedback on HORI 3D Surround Gaming Neckset\". 
Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "intent_template": "Create a post in the game related discussion forum about {{product}} to report customer reviews with {{rating}} rating from the OneStopShop with the post title \"real user feedback on {{product}}\". Format the post body as a bullet point list in the same order they appear (md format: - \"\").", - "instantiation_dict": {"product": "HORI 3D Surround Gaming Neckset", "rating": "2 stars and less"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "$.actions[0].content": {"type": "string", "format": "markdown"} } - }, - "expected": { - "url": "^__REDDIT__/submit.*$", - "http_method": "POST", - "post_data": { - "submission[title]": "real user feedback on HORI 3D Surround Gaming Neckset", - "submission[body]": "- \"Not worth it for PC users\"\r\n- \"I really wanted to like this.\"\r\n- \"I wish this was better...\"\r\n- \"I wish this was better...\"", - "submission[forum]": "10048" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 676, - "intent_template_id": 253, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the list of orders that are suspected of being fraudulent", - "intent_template": "Go to the list of orders that are {{status}}", - "instantiation_dict": {"status": "suspected of being fraudulent"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", - "headers": {"referer": 
"__SHOPPING_ADMIN__/sales/order/"}, - "query_params": { - "namespace" : [ "sales_order_grid" ], - "filters[placeholder]": [ "true" ], - "filters[status]" : [ "fraud" ], - "search" : [ "" ], - "keywordUpdated" : [ "false" ] - } - }, - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 677, - "intent_template_id": 253, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the list of orders that are processing", - "intent_template": "Go to the list of orders that are {{status}}", - "instantiation_dict": {"status": "processing"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/", - "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, - "query_params": { - "namespace" : [ "sales_order_grid" ], - "filters[placeholder]": [ "true" ], - "filters[status]" : [ "processing" ], - "search" : [ "" ], - "keywordUpdated" : [ "false" ] - } - }, - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 678, - "intent_template_id": 253, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the list of orders that are canceled", - "intent_template": "Go to the list of orders that are {{status}}", - "instantiation_dict": {"status": "canceled"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/", - "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, - "query_params": { - "namespace" : [ "sales_order_grid" ], 
- "filters[placeholder]": [ "true" ], - "filters[status]" : [ "canceled" ], - "search" : [ "" ], - "keywordUpdated" : [ "false" ] - } - }, - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 679, - "intent_template_id": 253, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the list of orders that are completed", - "intent_template": "Go to the list of orders that are {{status}}", - "instantiation_dict": {"status": "completed"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/mui/index/render/.*$", - "query_params": { - "namespace" : [ "sales_order_grid" ], - "filters[placeholder]": [ "true" ], - "filters[status]" : [ "complete" ], - "search" : [ "" ], - "keywordUpdated" : [ "false" ] - }, - "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"} - }, - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"], - "decode_base64_query": true - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 680, - "intent_template_id": 253, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Go to the list of orders that are on hold", - "intent_template": "Go to the list of orders that are {{status}}", - "instantiation_dict": {"status": "on hold"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/mui/index/render/", - "headers": {"referer": "__SHOPPING_ADMIN__/sales/order/"}, - "query_params": { - "namespace" : [ "sales_order_grid" ], - "filters[placeholder]": [ "true" ], - "filters[status]" : [ 
"holded" ], - "search" : [ "" ], - "keywordUpdated" : [ "false" ] - } - }, - "ignored_query_params_patterns": ["^paging", "^sorting", "isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 681, - "intent_template_id": 116, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to gan implementation and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "instantiation_dict": {"topic": "gan implementation"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/eriklindernoren/PyTorch-GAN", - "submission[title]": "eriklindernoren/PyTorch-GAN", - "submission[body]": "193 commits already!", - "submission[forum]": "^(10043|10018)$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 682, - "intent_template_id": 116, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to chatGPT and create a new post in the discussion forum with the URL of the repository to the most relevant forum. 
Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "instantiation_dict": {"topic": "chatGPT"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/convexegg/chatgpt", - "submission[title]": "convexegg/chatgpt", - "submission[body]": "193 commits already!", - "submission[forum]": "^(10043|10018|10082|10078)$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 683, - "intent_template_id": 116, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Find a GitLab repository related to metaseq and create a new post in the discussion forum with the URL of the repository to the most relevant forum. Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "intent_template": "Find a GitLab repository related to {{topic}} and create a new post in the discussion forum with the URL of the repository to the most relevant forum. 
Use the repository path as the post title and \" commit(s) already!\" as the post body.", - "instantiation_dict": {"topic": "metaseq"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/facebook/metaseq", - "submission[title]": "facebook/metaseq", - "submission[body]": "272 commits already!", - "submission[forum]": "^(10043|10018)$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 684, - "intent_template_id": 117, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote byteblaze/cloud-to-butt in the discussion forum LifeProTips by creating a post with the project description as the title.", - "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", - "instantiation_dict": {"repo": "byteblaze/cloud-to-butt", "forum": "LifeProTips"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/byteblaze/cloud-to-butt", - "submission[title]": "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'", - "submission[forum]": "10016" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 685, - "intent_template_id": 117, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote 
byteblaze/dotfiles in the discussion forum aww by creating a post with the project description as the title.", - "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", - "instantiation_dict": {"repo": "byteblaze/dotfiles", "forum": "aww"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/byteblaze/dotfiles", - "submission[title]": "\ud83e\udd16 Computer setup", - "submission[forum]": "10035" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 686, - "intent_template_id": 117, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote auth0/angular-storage in the discussion forum technology by creating a post with the project description as the title.", - "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", - "instantiation_dict": {"repo": "auth0/angular-storage", "forum": "technology"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/auth0/angular-storage", - "submission[title]": "A storage library for AngularJS done right", - "submission[forum]": "10082" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 687, 
- "intent_template_id": 117, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote koush/AndroidAsync in the discussion forum funny by creating a post with the project description as the title.", - "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", - "instantiation_dict": {"repo": "koush/AndroidAsync", "forum": "funny"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/koush/AndroidAsync", - "submission[title]": "Asynchronous socket, http(s) (client+server) and websocket library for android. Based on nio, not threads.", - "submission[forum]": "10046" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit", "gitlab"], - "task_id": 688, - "intent_template_id": 117, - "start_urls": ["__REDDIT__", "__GITLAB__"], - "intent": "Promote lahwaacz/arch-wiki-docs in the discussion forum science by creating a post with the project description as the title.", - "intent_template": "Promote {{repo}} in the discussion forum {{forum}} by creating a post with the project description as the title.", - "instantiation_dict": {"repo": "lahwaacz/arch-wiki-docs", "forum": "science"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__REDDIT__/submit/?.*$", - "http_method": "POST", - "post_data": { - "submission[mediaType]": "url", - "submission[url]": "__GITLAB__/lahwaacz/arch-wiki-docs", - "submission[title]": "A script to 
download pages from Arch Wiki for offline browsing", - "submission[forum]": "10077" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 689, - "intent_template_id": 163, - "start_urls": ["__SHOPPING__"], - "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I am a loyal customer, give me a coupon\". Keep it ready for review.", - "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". Keep it ready for review.", - "instantiation_dict": {"reason": "I am a loyal customer, give me a coupon"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "I am a loyal customer, give me a coupon" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 690, - "intent_template_id": 163, - "start_urls": ["__SHOPPING__"], - "intent": "Prepare a coupon request in the shop's contact form with the reason: \"they promised me a coupon last time\". Keep it ready for review.", - "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", - "instantiation_dict": {"reason": "they promised me a coupon last time"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "they promised me a coupon last time" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 691, - "intent_template_id": 163, - "start_urls": ["__SHOPPING__"], - "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I plan to make a bulk purchase\". Keep it ready for review.", - "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", - "instantiation_dict": {"reason": "I plan to make a bulk purchase"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "I plan to make a bulk purchase" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 692, - "intent_template_id": 163, - "start_urls": ["__SHOPPING__"], - "intent": "Prepare a coupon request in the shop's contact form with the reason: \"I am a student\". Keep it ready for review.", - "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", - "instantiation_dict": {"reason": "I am a student"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "I am a student" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 693, - "intent_template_id": 163, - "start_urls": ["__SHOPPING__"], - "intent": "Prepare a coupon request in the shop's contact form with the reason: \"my refund is supposed to be replaced by a coupon\". Keep it ready for review.", - "intent_template": "Prepare a coupon request in the shop's contact form with the reason: \"{{reason}}\". 
Keep it ready for review.", - "instantiation_dict": {"reason": "my refund is supposed to be replaced by a coupon"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^http://.*/dummy_bin$", - "http_method": "POST", - "post_data": { - "form_id": "contact-form", - "name": "Emma Lopez", - "email": "emma.lopez@gmail.com", - "comment": "my refund is supposed to be replaced by a coupon" - }, - "response_status": -1 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "should_not_exist": true, - "expected": {"url": "__SHOPPING__/contact/index/post", "http_method": "POST"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 694, - "intent_template_id": 256, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named \"Energy-Bulk Women Shirt\" with 50 in stock, available in size S and color blue, priced at $60 using the appropriate attribute set.", - "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", - "instantiation_dict": { - "product": "Energy-Bulk Women Shirt", - "stock": "50", - "size": "size S", - "color": "blue", - "price": "60" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Energy-Bulk Women Shirt", - "product[price]": "60", - "product[status]": "1", - 
"product[quantity_and_stock_status][qty]": "50", - "product[quantity_and_stock_status][is_in_stock]": "1", - "product[size]": "167", - "product[color]": "50" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 695, - "intent_template_id": 256, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named \"Energy-Bulk Man Yoga Pant\" with 50 in stock, available in size 38 and color yellow, priced at $69.99 using the appropriate attribute set.", - "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", - "instantiation_dict": { - "product": "Energy-Bulk Man Yoga Pant", - "stock": "50", - "size": "size 38", - "color": "yellow", - "price": "69.99" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Energy-Bulk Man Yoga Pant", - "product[price]": "69.99", - "product[status]": "1", - "product[quantity_and_stock_status][qty]": "50", - "product[quantity_and_stock_status][is_in_stock]": "1", - "product[size]": "179", - "product[color]": "60" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 696, - "intent_template_id": 256, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named \"FancyBoy Man Causal Jeans\" with 42 in stock, available in size 34 and color Blue, priced at $169.99 using the appropriate attribute set.", - "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, 
available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", - "instantiation_dict": { - "product": "FancyBoy Man Causal Jeans", - "stock": "42", - "size": "size 34", - "color": "Blue", - "price": "169.99" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "FancyBoy Man Causal Jeans", - "product[price]": "169.99", - "product[status]": "1", - "product[quantity_and_stock_status][qty]": "42", - "product[quantity_and_stock_status][is_in_stock]": "1", - "product[size]": "177", - "product[color]": "50" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 697, - "intent_template_id": 256, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named \"Swatch Smart Watch\" with 42 in stock, available in a single size and color Blue, priced at $769.99 using the appropriate attribute set.", - "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", - "instantiation_dict": { - "product": "Swatch Smart Watch", - "stock": "42", - "size": "a single size", - "color": "Blue", - "price": "769.99" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Swatch Smart Watch", - "product[price]": "769.99", - "product[status]": "1", - "product[quantity_and_stock_status][qty]": "42", - "product[quantity_and_stock_status][is_in_stock]": "1", - "product[color]": "50" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 698, - "intent_template_id": 256, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Add a simple product named \"Lelelumon Yoga Mat\" with 42 in stock, available in size uni-size and color black, priced at $769.99 using the appropriate attribute set.", - "intent_template": "Add a simple product named \"{{product}}\" with {{stock}} in stock, available in {{size}} and color {{color}}, priced at ${{price}} using the appropriate attribute set.", - "instantiation_dict": { - "product": "Lelelumon Yoga Mat", - "stock": "42", - "size": "size uni-size", - "color": "black", - "price": "769.99" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["isAjax"], - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/type/simple/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "response_status": 302, - "post_data": { - "product[name]": "Lelelumon Yoga Mat", - "product[price]": "769.99", - "product[status]": "1", - "product[quantity_and_stock_status][qty]": "42", - "product[quantity_and_stock_status][is_in_stock]": "1", - "product[color]": "49" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 699, - "intent_template_id": 258, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"spring sale\" for all registered 
customers that offers a 20 percent discount site-wide", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", - "instantiation_dict": {"topic": "spring sale", "rule": "a 20 percent discount site-wide"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", - "http_method": "POST", - "response_status": 302, - "post_data": { - "name": "spring sale", - "website_ids": [1], - "customer_group_ids": [1], - "simple_action": "by_percent", - "discount_amount": 20 - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 700, - "intent_template_id": 258, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"fall discount\" for all registered customers that offers $10 discount on checkout", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", - "instantiation_dict": {"topic": "fall discount", "rule": "$10 discount on checkout"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", - "http_method": "POST", - "response_status": 302, - "post_data": { - "name": "fall discount", - "website_ids": [1], - "customer_group_ids": [1], - "simple_action": "cart_fixed", - "discount_amount": 10 - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 701, - "intent_template_id": 258, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing 
price rule called \"Mother's day sale\" for all registered customers that offers 15% discount on checkout on all their cart", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", - "instantiation_dict": { - "topic": "Mother's day sale", - "rule": "15% discount on checkout on all their cart" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", - "http_method": "POST", - "response_status": 302, - "post_data": { - "name": "Mother's day sale", - "is_active": "1", - "customer_group_ids[0]": "1", - "website_ids[0]": "1", - "coupon_type": "1", - "simple_action": "by_percent", - "discount_amount": 15 - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 702, - "intent_template_id": 258, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"Pride Month\" for all registered customers that offers 45% off on all products", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", - "instantiation_dict": {"topic": "Pride Month", "rule": "45% off on all products"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/catalog_rule/promo_catalog/save/", - "http_method": "POST", - "response_status": 302, - "post_data": { - "name": "Pride Month", - "is_active": "1", - "customer_group_ids[0]": "1", - "website_ids[0]": "1", - "simple_action": "by_percent", - "discount_amount": "45" - } - } - } - ], - 
"revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 703, - "intent_template_id": 258, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Create a new marketing price rule called \"Thanks giving sale\" for all registered customers that offers $40 discount on all their purchase", - "intent_template": "Create a new marketing price rule called \"{{topic}}\" for all registered customers that offers {{rule}}", - "instantiation_dict": {"topic": "Thanks giving sale", "rule": "$40 discount on all their purchase"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/sales_rule/promo_quote/save", - "http_method": "POST", - "response_status": 302, - "post_data": { - "name": "Thanks giving sale", - "discount_amount": "40", - "is_active": "1", - "customer_group_ids[0]": "1", - "website_ids[0]": "1", - "coupon_type": "1", - "rule[actions][1][type]": "Magento\\SalesRule\\Model\\Rule\\Condition\\Product\\Combine", - "rule[actions][1][aggregator]": "all", - "rule[actions][1][value]": "1" - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 704, - "intent_template_id": 268, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the sales order report for for last months (today is March 15, 2023).", - "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", - "instantiation_dict": {"report": "sales order report", "time_span": "for last months"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - 
"url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "02/1/2023" ], - "to" : [ "02/28/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 705, - "intent_template_id": 268, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the sales order report for over the last 45 days (today is March 15, 2023).", - "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", - "instantiation_dict": {"report": "sales order report", "time_span": "over the last 45 days"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "01/29/2023" ], - "to" : [ "03/15/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 706, - "intent_template_id": 268, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the refund report for for Q1 (today is March 15, 2023).", - "intent_template": 
"Show the {{report}} for {{time_span}} (today is March 15, 2023).", - "instantiation_dict": {"report": "refund report", "time_span": "for Q1"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/refunded/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "01/1/2023" ], - "to" : [ "03/31/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 707, - "intent_template_id": 268, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the sales order report for for last year (today is March 15, 2023).", - "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", - "instantiation_dict": {"report": "sales order report", "time_span": "for last year"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "1/1/2022" ], - "to" : [ "12/31/2022" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - 
"report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 708, - "intent_template_id": 268, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the tax report for for this year (today is March 15, 2023).", - "intent_template": "Show the {{report}} for {{time_span}} (today is March 15, 2023).", - "instantiation_dict": {"report": "tax report", "time_span": "for this year"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/tax/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "01/1/2023" ], - "to" : [ "03/15/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 709, - "intent_template_id": 271, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the orders report from May 1, 2021 to March 31, 2022.", - "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", - "instantiation_dict": { - "type": "orders", - "start_date": "May 1, 2021", - "end_date": "March 31, 2022", - "article": "an" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - 
"expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/sales/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "05/1/2021" ], - "to" : [ "03/31/2022" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 710, - "intent_template_id": 271, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the shipping report from August 5, 2022 to March 1, 2023.", - "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", - "instantiation_dict": { - "type": "shipping", - "start_date": "August 5, 2022", - "end_date": "March 1, 2023", - "article": "a" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/shipping/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "08/5/2022" ], - "to" : [ "03/1/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": 
"date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 711, - "intent_template_id": 271, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the product view report from July 5, 2021 to May 31, 2023.", - "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", - "instantiation_dict": { - "type": "product view", - "start_date": "July 5, 2021", - "end_date": "May 31, 2023", - "article": "a" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_product/viewed/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "07/5/2021" ], - "to" : [ "05/31/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 712, - "intent_template_id": 271, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the coupons report from May 1, 2021 to May 15, 2023.", - "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", - "instantiation_dict": { - "type": "coupons", - "start_date": "May 1, 2021", - "end_date": "May 15, 2023", - "article": "a" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - 
"ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "05/1/2021" ], - "to" : [ "05/15/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 713, - "intent_template_id": 271, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Show the best sellers report from May 1, 2022 to May 31, 2023.", - "intent_template": "Show the {{type}} report from {{start_date}} to {{end_date}}.", - "instantiation_dict": { - "type": "best sellers", - "start_date": "May 1, 2022", - "end_date": "May 31, 2023", - "article": "a" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": ["^(?!report_type$|from$|to$).*$"], - "decode_base64_query": true, - "expected": { - "url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/filter", - "query_params": { - "report_type": [ "created_at_order" ], - "from" : [ "05/1/2022" ], - "to" : [ "05/31/2023" ] - } - }, - "query_params_schema": { - "type": "object", - "properties": { - "report_type": { "type": "array", "items": {"type": "string"} }, - "from" : { "type": "array", "items": {"type": "string", "format": "date"} }, - "to" : { "type": "array", "items": {"type": "string", "format": "date"} } - } - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 714, - "intent_template_id": 24, - 
"start_urls": ["__REDDIT__"], - "intent": "Thumbs down the top 1 post ever in gadgets.", - "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", - "instantiation_dict": {"k": "1", "forum": "gadgets"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/19459.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 715, - "intent_template_id": 24, - "start_urls": ["__REDDIT__"], - "intent": "Thumbs down the top 2 post ever in history.", - "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", - "instantiation_dict": {"k": "2", "forum": "history"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/58888.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/41616.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 716, - "intent_template_id": 24, - "start_urls": ["__REDDIT__"], - "intent": "Thumbs down the top 3 post ever in books.", - "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", - "instantiation_dict": {"k": "3", "forum": "books"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/81371.json", - 
"http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/59421.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/59447.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 717, - "intent_template_id": 24, - "start_urls": ["__REDDIT__"], - "intent": "Thumbs down the top 4 post ever in movies.", - "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", - "instantiation_dict": {"k": "4", "forum": "movies"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/86174.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/86029.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/86055.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/42682.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 718, - "intent_template_id": 24, - "start_urls": ["__REDDIT__"], - "intent": "Thumbs down the top 5 post ever in technology.", - "intent_template": "Thumbs down the top {{k}} post ever in {{forum}}.", - "instantiation_dict": {"k": "5", "forum": "technology"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", 
"status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/48670.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/134696.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/48785.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/70354.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/70233.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 719, - "intent_template_id": 25, - "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by ThetaGang_wsb in forum wallstreetbets", - "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "ThetaGang_wsb", "forum": "wallstreetbets"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/29478.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/29458.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 720, - "intent_template_id": 25, - "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by CameronKelsey in forum earthporn", - "intent_template": "Like all 
submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "CameronKelsey", "forum": "earthporn"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/98332.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/98297.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/98256.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 721, - "intent_template_id": 25, - "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by UniversityofBath in forum IAmA", - "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "UniversityofBath", "forum": "IAmA"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/119742.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/119719.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/119714.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/55155.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - 
"evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/55142.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/34032.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/13175.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/13170.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 722, - "intent_template_id": 25, - "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by Don_Gato1 in forum nyc", - "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "Don_Gato1", "forum": "nyc"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/44650.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 723, - "intent_template_id": 25, - "start_urls": ["__REDDIT__"], - "intent": "Like all submissions created by FTorrez81 in forum iphone13", - "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "FTorrez81", "forum": "iphone13"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 724, - "intent_template_id": 25, - "start_urls": 
["__REDDIT__"], - "intent": "Like all submissions created by Hrekires in forum news", - "intent_template": "Like all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "Hrekires", "forum": "news"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129816.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129808.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129794.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129783.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129594.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129508.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43839.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43781.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43572.json", - "http_method": "POST", - "post_data": {"choice": "1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43558.json", - "http_method": 
"POST", - "post_data": {"choice": "1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 725, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by RickyDontLoseThat in forum massachusetts", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "RickyDontLoseThat", "forum": "massachusetts"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/84954.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 726, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by jacyanthis in forum earthporn", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "jacyanthis", "forum": "earthporn"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 727, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by PatientBuilder499 in forum videos", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "PatientBuilder499", "forum": "videos"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": 
"__REDDIT__/sv/115139.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 728, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by sirbarani in forum sports", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "sirbarani", "forum": "sports"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/48303.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 729, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by AdamCannon in forum UpliftingNews", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "AdamCannon", "forum": "UpliftingNews"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/16087.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 730, - "intent_template_id": 1510, - "start_urls": ["__REDDIT__"], - "intent": "DisLike all submissions created by Hrekires in forum news", - "intent_template": "DisLike all submissions created by {{user}} in forum {{forum}}", - "instantiation_dict": {"user": "Hrekires", "forum": "news"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": 
{"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129816.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129808.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129794.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129783.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129594.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/129508.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43839.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43781.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43572.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/sv/43558.json", - "http_method": "POST", - "post_data": {"choice": "-1"} - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 731, - "intent_template_id": 27, - "start_urls": ["__REDDIT__"], - "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"", - "intent_template": "Edit my post on 
{{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": {"post": "Nvidia RTX 4090", "content": "EDIT: This news aged well"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/MachineLearning/1/-/edit", - "http_method": "POST", - "post_data": { - "submission[title]": "Nvidia RTX 4090", - "submission[body]": "^\\s*Crazy device for ML!\\s*EDIT: This news aged well\\s*$" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 732, - "intent_template_id": 27, - "start_urls": ["__REDDIT__"], - "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"", - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": {"post": "The Night Agent", "content": "Done watching, pretty cool!"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/television/134868/-/edit", - "http_method": "POST", - "post_data": { - "submission[title]": "'The Night Agent' Renewed for Season 2 at Netflix", - "submission[body]": "Done watching, pretty cool!" 
- }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 733, - "intent_template_id": 27, - "start_urls": ["__REDDIT__"], - "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"", - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": { - "post": "Star Trek Starfleet Academy series", - "content": "Every watch makes me feel like a kid again" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/television/135201/-/edit", - "http_method": "POST", - "post_data": { - "submission[title]": "Star Trek: Starfleet Academy Series, From Alex Kurtzman and Nancy Drew Creator, Ordered at Paramount+", - "submission[body]": "Every watch makes me feel like a kid again" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 734, - "intent_template_id": 27, - "start_urls": ["__REDDIT__"], - "intent": "Edit my post on Ted Lasso season 3 premiere by adding a line to the body that says \"Done watching. I love the renew!\"", - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": { - "post": "Ted Lasso season 3 premiere", - "content": "Done watching. I love the renew!" 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/television/135156/-/edit", - "http_method": "POST", - "post_data": { - "submission[title]": "\u2018Ted Lasso\u2019 Season 3 Premiere Scores 870K U.S. Households, Up 59% From Season 2 Premiere", - "submission[body]": "Done watching. I love the renew!" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["reddit"], - "task_id": 735, - "intent_template_id": 27, - "start_urls": ["__REDDIT__"], - "intent": "Edit my post on Lord of the Rings season 2 cast announcement with the lowest vote count by adding a line to the body that says \"The cast is amazing!\"", - "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"", - "instantiation_dict": { - "post": "Lord of the Rings season 2 cast announcement with the lowest vote count", - "content": "The cast is amazing!" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__REDDIT__/f/television/70886/-/edit", - "http_method": "POST", - "post_data": { - "submission[title]": "\u2018The Lord Of The Rings: The Rings Of Power\u2019 Adds 8 To Season 2 Cast", - "submission[body]": "The cast is amazing!" 
- }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 736, - "intent_template_id": 355, - "start_urls": ["__GITLAB__"], - "intent": "Change the LICENSE for repo byteblaze/gimmiethat.space and byteblaze/dotfiles to MIT licenses", - "intent_template": "Change the LICENSE for repo {{repo}} to {{license}}", - "instantiation_dict": { - "repo": "byteblaze/gimmiethat.space and byteblaze/dotfiles", - "license": "MIT licenses" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - "url": [ - "__GITLAB__/byteblaze/gimmiethat.space/-/update/main/LICENSE", - "__GITLAB__/byteblaze/gimmiethat.space/-/create/main", - "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "main", - "content": "^.*MIT.*License.*$", - "_method": "^.*(?!delete$).*$" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { - "_method" : { "type": "string" }, - "file_path" : { "type": "string" }, - "branch_name": { "type": "string" }, - "content" : { "type": "string" } - } - }, - "expected": { - "url": [ - "__GITLAB__/byteblaze/dotfiles/-/update/main/LICENSE", - "__GITLAB__/byteblaze/dotfiles/-/create/main", - "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE" - ], - "http_method": "POST", - "post_data": { - "$.^file_path|file_name$": "LICENSE", - "branch_name": "main", - "content": "^.*MIT.*License.*$", - "_method": "^.*(?!delete$).*$" - }, - 
"response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 737, - "intent_template_id": 94, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Philadelphia 76ers. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "location": "Carnegie Mellon University", - "sport_team": "Philadelphia 76ers", - "time_phrase": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-75.1718916,39.9011873;-79.9427192,40.4441897.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 738, - "intent_template_id": 94, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70s. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "location": "Carnegie Mellon University", - "sport_team": "Philadelphia 76ers", - "time_phrase": " in the 70s" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-75.1712951,39.9042046;-79.9427192,40.4441897.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 739, - "intent_template_id": 94, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Yankees in the 80s. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "location": "Carnegie Mellon University", - "sport_team": "Yankees", - "time_phrase": " in the 80s" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-73.9265212,40.8295828;-79.9427192,40.4441897.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 740, - "intent_template_id": 94, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "View the route on the map from Carnegie Mellon University to the home stadium of NYC NBA team. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "location": "Carnegie Mellon University", - "sport_team": "NYC NBA team", - "time_phrase": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-73.9935443,40.7505085;-79.9427192,40.4441897.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["wikipedia", "map"], - "task_id": 741, - "intent_template_id": 94, - "start_urls": ["__MAP__", "__WIKIPEDIA__"], - "intent": "View the route on the map from Carnegie Mellon University to the home stadium of Boston home NBA team. (Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "intent_template": "View the route on the map from {{location}} to the home stadium of {{sport_team}}{{time_phrase}}. 
(Use the OSRM direction service and the provided wiki to look up any needed information and search both source and destination by coordinates from the place official page on the wiki.)", - "instantiation_dict": { - "location": "Carnegie Mellon University", - "sport_team": "Boston home NBA team", - "time_phrase": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "^.*/route/v1/.*/-71.0621475,42.3662922;-79.9427192,40.4441897.*$"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 742, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "new private project", - "project_name": "planner", - "suffix": " and add Abishek, Vinta as members" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": {"name": "planner", "visibility": "private"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 278, "access_level": 30}, - "response_status": 201 - } - } - ], - 
"revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 743, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "new public project", - "project_name": "web_arena", - "suffix": " and add Abishek, Vinta as members" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": {"name": "web_arena", "visibility": "public"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 278, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 744, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a new public project \"AutoAGI\" and add primer as members", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "new public project", - "project_name": "AutoAGI", - "suffix": " and add primer as members" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": 
"__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": {"name": "AutoAGI", "visibility": "public"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 2367, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 745, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "new public project", - "project_name": "awesome-llms", - "suffix": " and add primer, convexegg, abishek as members" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": {"name": "awesome-llms", "visibility": "public"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 2367, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 43, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - 
"task_id": 746, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "new private project", - "project_name": "llm_bulk_inference", - "suffix": " and add primer, convexegg, abishek as members" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": {"name": "llm_bulk_inference", "visibility": "private"}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 2367, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 43, "access_level": 30}, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/members$", - "http_method": "POST", - "post_data": {"user_id": 5, "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 747, - "intent_template_id": 2100, - "start_urls": ["__GITLAB__"], - "intent": "Start a private project called \"awesome_web_agents\" with blank template and add Abishek, Vinta as members", - "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "awesome_web_agents", - 
"template": "blank", - "account_list": "Abishek, Vinta" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": null, - "project[name]": "awesome_web_agents", - "project[path]": "awesome_web_agents", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "5,278"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 748, - "intent_template_id": 2100, - "start_urls": ["__GITLAB__"], - "intent": "Start a private project called \"web_agent_android_xl\" with Android template and add primer, convexegg, abishek as members", - "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "web_agent_android_xl", - "template": "Android", - "account_list": "primer, convexegg, abishek" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": { - "name": "web_agent_android_xl", - "visibility": "private", - "namespace_id": 2505, - "template_name": "android" - }, - 
"response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "2367,43,5", "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 749, - "intent_template_id": 2100, - "start_urls": ["__GITLAB__"], - "intent": "Start a private project called \"project_site\" with NodeJS template and add primer, convexegg, vinta as members", - "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "project_site", - "template": "NodeJS", - "account_list": "primer, convexegg, vinta" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/api/v4/projects", - "http_method": "POST", - "post_data": { - "name": "project_site", - "visibility": "private", - "namespace_id": 2505, - "template_name": "express" - }, - "response_status": 201 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "2367,43,278", "access_level": 30}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 750, - "intent_template_id": 2100, - "start_urls": ["__GITLAB__"], - "intent": "Start a private project called \"agi_index\" with HTML Gitlab pages template and add Vinta 
Chen as members", - "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "agi_index", - "template": "HTML Gitlab pages", - "account_list": "Vinta Chen" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "plainhtml", - "project[name]": "agi_index", - "project[path]": "agi_index", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "278"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 751, - "intent_template_id": 2100, - "start_urls": ["__GITLAB__"], - "intent": "Start a private project called \"AGISite\" with JEKYLL template and add Rohan and Vinta as members", - "intent_template": "Start a private project called \"{{project_name}}\" with {{template}} template and add {{account_list}} as members", - "instantiation_dict": { - "project_name": "AGISite", - "template": "JEKYLL", - "account_list": "Rohan and Vinta" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "jekyll", - "project[name]": "AGISite", - "project[path]": "AGISite", - 
"project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/projects/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "2366,278"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 752, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a private blank repository called \"web_agent\"", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "private blank repository called", - "project_name": "web_agent", - "suffix": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[name]": "web_agent", - "project[path]": "web_agent", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 753, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "private Android repository called", - "project_name": "web_agent_android_xs", - "suffix": " using the right template to speed up development." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "android", - "project[name]": "web_agent_android_xs", - "project[path]": "web_agent_android_xs", - "project[namespace_id]": "2505", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 754, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "private NodeJS repository called", - "project_name": "web_agent_nodejs", - "suffix": " using the right template to speed up development." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "express", - "project[name]": "web_agent_nodejs", - "project[namespace_id]": "2505", - "project[path]": "web_agent_nodejs", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 755, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "private HTML repository called", - "project_name": "web_agent_index", - "suffix": " using the right template to speed up development." 
- }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "plainhtml", - "project[name]": "web_agent_index", - "project[namespace_id]": "2505", - "project[path]": "web_agent_index", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 756, - "intent_template_id": 332, - "start_urls": ["__GITLAB__"], - "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.", - "intent_template": "Create a {{create_spec}} \"{{project_name}}\"{{suffix}}", - "instantiation_dict": { - "create_spec": "private JEKYLL repository called", - "project_name": "11711_gitlab", - "suffix": " using the right template to speed up development." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/projects", - "http_method": "POST", - "post_data": { - "project[template_name]": "nfjekyll", - "project[name]": "11711_gitlab", - "project[namespace_id]": "2505", - "project[path]": "11711_gitlab", - "project[visibility_level]": "0" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 757, - "intent_template_id": 42, - "start_urls": ["__MAP__"], - "intent": "Show on the map the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions. (Use the OSRM direction service.)", - "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "home of the 1980 Super Bowl champions", - "city2": "home of the 1991 Super Bowl champions", - "view_type": "path and travel time" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 758, - "intent_template_id": 42, - "start_urls": ["__MAP__"], - "intent": "Show on the map the path and travel time from the big apple to biggest city in Maine. (Use the OSRM direction service.)", - "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the big apple", - "city2": "biggest city in Maine", - "view_type": "path and travel time" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map", "shopping_admin"], - "task_id": 759, - "intent_template_id": 42, - "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], - "intent": "Show on the map the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City. (Use the OSRM direction service.)", - "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "the city where my E-commerce customer Sophia Young lives", - "city2": "New York City", - "view_type": "route and driving time" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": { - "url": "^.*/route/v1/.*/-71.060511,42.3554334;-1.4869496,55.0252998.*$", - "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} - } - } - ], - "revision": 2 - }, - { - "sites": ["map", "shopping_admin"], - "task_id": 760, - "intent_template_id": 42, - "start_urls": ["__MAP__", "__SHOPPING_ADMIN__"], - "intent": "Show on the map the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives. (Use the OSRM direction service.)", - "intent_template": "Show on the map the {{view_type}} from {{city1}} to {{city2}}. (Use the OSRM direction service.)", - "instantiation_dict": { - "city1": "Allentown, PA", - "city2": "the city where my E-commerce customer Amanda Kim lives", - "view_type": "route and driving time" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": { - "url": "^.*/route/v1/.*/-75.4716115,40.6022552;-74.4041622,40.0757384.*$", - "headers": {"Cookie": "^(?!.*_osm_directions_engine=fossgis_osrm_(?:bicycle|foot)).*$"} - } - } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 761, - "intent_template_id": 54, - "start_urls": ["__MAP__"], - "intent": "Show on the map directions from Carnegie Science Museum to Hunt library CMU using walk options. 
(Use the OSRM direction service.)", - "intent_template": "Show on the map directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Science Museum", - "location_address_2": "Hunt library CMU", - "transportation": "walk" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 762, - "intent_template_id": 54, - "start_urls": ["__MAP__"], - "intent": "Show on the map directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options. (Use the OSRM direction service.)", - "intent_template": "Show on the map directions from {{location_address_1}} to {{location_address_2}} using {{transportation}} options. (Use the OSRM direction service.)", - "instantiation_dict": { - "location_address_1": "Carnegie Music Hall in NYC", - "location_address_2": "Carnegie Mellon University", - "transportation": "driving" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 763, - "intent_template_id": 75, - "start_urls": ["__MAP__"], - "intent": "Show on the map the walking route to the closest Trader Joe's from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", - "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", - "instantiation_dict": {"store": "Trader Joe's", "location": "401 Shady Ave, Pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 764, - "intent_template_id": 75, - "start_urls": ["__MAP__"], - "intent": "Show on the map the walking route to the closest Target from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", - "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. (Use the OSRM direction service.)", - "instantiation_dict": {"store": "Target", "location": "401 Shady Ave, Pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 765, - "intent_template_id": 75, - "start_urls": ["__MAP__"], - "intent": "Show on the map the walking route to the closest Japanese food market from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", - "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", - "instantiation_dict": {"store": "Japanese food market", "location": "401 Shady Ave, Pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 766, - "intent_template_id": 75, - "start_urls": ["__MAP__"], - "intent": "Show on the map the walking route to the closest grocery owned by Amazon from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", - "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. (Use the OSRM direction service.)", - "instantiation_dict": {"store": "grocery owned by Amazon", "location": "401 Shady Ave, Pittsburgh"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["map"], - "task_id": 767, - "intent_template_id": 75, - "start_urls": ["__MAP__"], - "intent": "Show on the map the walking route to the closest chain grocery owned by a local business from 401 Shady Ave, Pittsburgh. (Use the OSRM direction service.)", - "intent_template": "Show on the map the walking route to the closest {{store}} from {{location}}. 
(Use the OSRM direction service.)", - "instantiation_dict": { - "store": "chain grocery owned by a local business", - "location": "401 Shady Ave, Pittsburgh" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "navigate", "status": "SUCCESS", "retrieved_data": null} - }, - { "evaluator": "NetworkEventEvaluator", "expected": {"url": "__MAP__"} } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 768, - "intent_template_id": 241, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock", - "intent_template": "{{arrival_phrase}}, {{update_phrase}}", - "instantiation_dict": { - "arrival_phrase": "5 blue Cronus yoga pants with size 33 arrived", - "update_phrase": "update the stock" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/872/type/(simple|configurable)/store/0/set/\\d+/back/edit$", - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][qty]": "5"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 769, - "intent_template_id": 241, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory", - "intent_template": "{{arrival_phrase}}, {{update_phrase}}", - "instantiation_dict": { - "arrival_phrase": "We've received 378 brown Aero daily fitness tee in every size", - "update_phrase": "please update the inventory" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { 
- "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/544/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[quantity_and_stock_status][qty]": "478"} - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/547/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "478"}, - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/550/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "478"}, - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/553/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "478"}, - "http_method": "POST", - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/556/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "478"}, - "http_method": "POST", - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 770, - "intent_template_id": 241, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory", - "intent_template": "{{arrival_phrase}}, {{update_phrase}}", - "instantiation_dict": { - "arrival_phrase": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29", - "update_phrase": "update the inventory" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - 
"results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "http_method": "POST", - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1836/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "112"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_post_data_params_patterns": ["^form_key$"], - "expected": { - "http_method": "POST", - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1838/type/simple/store/0/set/\\d+/back/edit$", - "post_data": {"product[quantity_and_stock_status][qty]": "156"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 771, - "intent_template_id": 243, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Approve reviews with four stars or higher to display in our store.", - "intent_template": "Approve reviews with four stars or higher to display in our store.", - "instantiation_dict": {}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "http_method": "POST", - "url": "__SHOPPING_ADMIN__/review/product/save/id/352", - "post_data": {"status_id": "1"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "http_method": "POST", - "url": "__SHOPPING_ADMIN__/review/product/save/id/347", - "post_data": {"status_id": "1"}, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 772, - "intent_template_id": 246, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all pending negative reviews for Circe fleece", - "intent_template": 
"Delete all {{review_type}}", - "instantiation_dict": {"review_type": "pending negative reviews for Circe fleece"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/999/", - "response_status": 302, - "http_method": "POST" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 773, - "intent_template_id": 246, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all pending negative reviews", - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": {"review_type": "pending negative reviews"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/351/", - "response_status": 302, - "http_method": "POST" - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/353/", - "response_status": 302, - "http_method": "POST" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 774, - "intent_template_id": 246, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all pending reviews with less than 4 stars", - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": {"review_type": "pending reviews with less than 4 stars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/351/", - 
"response_status": 302, - "http_method": "POST" - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/353/", - "response_status": 302, - "http_method": "POST" - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/349/", - "response_status": 302, - "http_method": "POST" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 775, - "intent_template_id": 246, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all reviews from the scammer Arden", - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": {"review_type": "reviews from the scammer Arden"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/51/", - "response_status": 302, - "http_method": "POST" - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 776, - "intent_template_id": 246, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all reviews from the scammer Carlo", - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": {"review_type": "reviews from the scammer Carlo"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/109/", - "response_status": 302, - "http_method": "POST" - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__SHOPPING_ADMIN__/review/product/delete/id/93/", - "response_status": 302, - "http_method": "POST" - } - } - ], - "revision": 
2 - }, - { - "sites": ["shopping_admin"], - "task_id": 777, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Reduce the price of green Hollister backyard sweatshirt in all sizes by $5", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "$5", - "action": "Reduce", - "config": "green Hollister backyard sweatshirt in all sizes" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/120/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 47.0} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/117/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 47.0} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/114/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 47.0} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": 
"object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/111/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 47.0} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/123/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 47.0} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 778, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Reduce the price of size 28 Sahara leggings by 13.5%", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": {"amount": "13.5%", "action": "Reduce", "config": "size 28 Sahara leggings"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1841/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 64.88} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1842/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 64.88} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1843/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 64.88} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 779, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all sizes below L by 15%", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "15%", - "action": "Reduce", - "config": "yellow shirts from Gwyn Endurance in all sizes below L" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1559/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 20.4} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1562/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 20.4} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1565/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 20.4} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 780, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__/catalog/product/edit/id/1481/"], - "intent": "Increase the price of white Ingrid Running with size L and above by $17", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "$17", - "action": "Increase", - "config": "white Ingrid Running with size L and above" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1264/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 101.0} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": 
"^__SHOPPING_ADMIN__/catalog/product/save/id/1267/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 101.0} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 781, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "37%", - "action": "Increase", - "config": "black fitness tshirts from Desiree with size XS" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/1573/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 32.88} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 782, - "intent_template_id": 742, - "start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%", - "intent_template": "{{action}} the price of {{config}} by {{amount}}", - "instantiation_dict": { - "amount": "23%", - "action": "Increase", - "config": "all blue running tshirts in extra small and small sizes" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", 
- "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/499/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 35.67} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/479/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 34.44} - }, - "ignored_query_params_patterns": ["isAjax"] - }, - { - "evaluator": "NetworkEventEvaluator", - "post_data_schema": { - "type": "object", - "properties": { "product[price]": {"type": "number", "format": "currency"} } - }, - "expected": { - "url": "^__SHOPPING_ADMIN__/catalog/product/save/id/482/type/simple/store/0/set/\\d+/back/edit$", - "response_status": 302, - "http_method": "POST", - "post_data": {"product[price]": 34.44} - }, - "ignored_query_params_patterns": ["isAjax"] - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 783, - "intent_template_id": 351, - "start_urls": ["__GITLAB__"], - "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: \"yjlou\" and \"a11yproject\"", - "intent_template": "Add the following users to {{repo_prefix}} {{repo}} as {{role}}: {{user_list}}", - "instantiation_dict": { - "repo": "kkroening/ffmpeg-python", - "role": "maintainer", - "user_list": "\"yjlou\" and \"a11yproject\"", - "repo_prefix": "repo" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "retrieve", - "status": "PERMISSION_DENIED_ERROR", - "retrieved_data": null - } - } - ], - 
"revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 784, - "intent_template_id": 316, - "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], - "intent": "Get the email address of the contributor who has the most commits to branch master in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", - "instantiation_dict": { - "branch_name": "master", - "attribute": "email address", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["secupwn@users.noreply.github.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 785, - "intent_template_id": 316, - "start_urls": ["__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector"], - "intent": "Get the email address of the contributor who has the most commits to branch gh-page in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", - "instantiation_dict": { - "branch_name": "gh-page", - "attribute": "email address", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "string"} }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": ["secupwn@users.noreply.github.com"] - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 786, - "intent_template_id": 316, - "start_urls": ["__GITLAB__/vinta/awesome-python"], - "intent": "Get the number of commits of the contributor who has the most commits to branch master in the current repository", - 
"intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", - "instantiation_dict": { - "branch_name": "master", - "attribute": "number of commits", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [414] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 787, - "intent_template_id": 316, - "start_urls": ["__GITLAB__/dehenne/awesome-visibility/-/graphs/master"], - "intent": "Get the number of followers of the contributor who has the most commits to branch master in the current repository", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", - "instantiation_dict": { - "branch_name": "master", - "attribute": "number of followers", - "retrieved_data_format_spec": "" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { "type": "array", "items": {"type": "number"} }, - "expected": { "task_type": "retrieve", "status": "SUCCESS", "retrieved_data": [0] } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 788, - "intent_template_id": 316, - "start_urls": ["__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed"], - "intent": "Get the full name, username, user location and email of the contributor who has the most commits to branch php52 in the current repository. 
Return a list of objects with keys \"full_name\", \"username\", \"user_location\", and \"email\".", - "intent_template": "Get the {{attribute}} of the contributor who has the most commits to branch {{branch_name}} in the current repository{{retrieved_data_format_spec}}", - "instantiation_dict": { - "branch_name": "php52", - "attribute": "full name, username, user location and email", - "retrieved_data_format_spec": ". Return a list of objects with keys \"full_name\", \"username\", \"user_location\", and \"email\"." - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": { - "type": "array", - "items": { - "type": "object", - "properties": { - "full_name" : { "type": "string" }, - "username" : { "type": "string" }, - "user_location": { "type": "string" }, - "email" : { "type": "string" } - } - } - }, - "expected": { - "task_type": "retrieve", - "status": "SUCCESS", - "retrieved_data": [ - { - "full_name": "Andrew M. Whalen", - "username": "^@?amwhalen$", - "user_location": "Massachusetts", - "email": "github@amwhalen.com" - } - ] - } - } - ], - "revision": 4 - }, - { - "sites": ["gitlab"], - "task_id": 789, - "intent_template_id": 328, - "start_urls": ["__GITLAB__"], - "intent": "Open an issue with title \"WebAgent Support Plan\" to ask the team if they have any plan on supporting Webagent in the next quater in the respository huggingface dataset.", - "intent_template": "Open an issue with title \"{{title}}\" to {{issue}} in the respository {{repo}}.", - "instantiation_dict": { - "title": "WebAgent Support Plan", - "issue": "ask the team if they have any plan on supporting Webagent in the next quater", - "repo": "huggingface dataset" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "retrieve", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - } - ], - "revision": 2 - }, - { - "sites": ["shopping_admin"], - "task_id": 790, - "intent_template_id": 246, - 
"start_urls": ["__SHOPPING_ADMIN__"], - "intent": "Delete all negative reviews for Sybil running short", - "intent_template": "Delete all {{review_type}}", - "instantiation_dict": {"review_type": "negative reviews for Sybil running short"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab", "reddit"], - "task_id": 791, - "intent_template_id": 84, - "start_urls": ["__GITLAB__", "__REDDIT__"], - "intent": "Use the Web IDE to create a folder named car in gimmiethat.space repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the cars forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "intent_template": "Use the Web IDE to create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.json that contains the full URLs of the 5 most recent posts from the {{forum}} forum on the discussion forum as a JSON object with a \"urls\" array. Write them in order from newest to oldest. Commit the changes to the default branch.", - "instantiation_dict": {"directory": "car", "gitlab_repo": "gimmiethat.space", "forum": "cars"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "NOT_FOUND_ERROR", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params_patterns": [".*"], - "expected": {"url": "__REDDIT__/forums"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 792, - "intent_template_id": 172, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Beauty & Personal Care category within a budget under 20. 
Discard any items in your cart if it is not empty.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", - "instantiation_dict": {"product_category": "Beauty & Personal Care", "dollar_value": "under 20"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "retrieve", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 793, - "intent_template_id": 172, - "start_urls": ["__SHOPPING__"], - "intent": "Buy the highest rated product from the Men clothing category within a budget above 50 but under 129.99. Discard any items in your cart if it is not empty.", - "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}. Discard any items in your cart if it is not empty.", - "instantiation_dict": { - "product_category": "Men clothing", - "dollar_value": "above 50 but under 129.99" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "retrieve", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 794, - "intent_template_id": 191, - "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent non canceled order to 4000 Forbes Ave, Pittsburgh, PA.", - "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", - "instantiation_dict": { - "order_selector": "my most recent non canceled order", - "address": "4000 Forbes Ave, Pittsburgh, PA" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": 
"ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 795, - "intent_template_id": 191, - "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my second most recent order to 6726 McPherson Blvd, Pittsburgh, PA.", - "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", - "instantiation_dict": { - "order_selector": "my second most recent order", - "address": "6726 McPherson Blvd, Pittsburgh, PA" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 796, - "intent_template_id": 191, - "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my oldest order in 2023 to 155 5th Street, San Francisco, CA.", - "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", - "instantiation_dict": { - "order_selector": "my oldest order in 2023", - "address": "155 5th Street, San Francisco, CA" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 797, - "intent_template_id": 191, - "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my first order ever to 3 Oxford St, Cambridge, MA.", - "intent_template": 
"Change the delivery address for {{order_selector}} to {{address}}.", - "instantiation_dict": { - "order_selector": "my first order ever", - "address": "3 Oxford St, Cambridge, MA" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["shopping"], - "task_id": 798, - "intent_template_id": 191, - "start_urls": ["__SHOPPING__"], - "intent": "Change the delivery address for my most recent non canceled order to 77 Massachusetts Ave, Cambridge, MA.", - "intent_template": "Change the delivery address for {{order_selector}} to {{address}}.", - "instantiation_dict": { - "order_selector": "my most recent non canceled order", - "address": "77 Massachusetts Ave, Cambridge, MA" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": {"url": "__SHOPPING__/sales/order/history/"} - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 799, - "intent_template_id": 600, - "start_urls": ["__GITLAB__"], - "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher", - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": {"name": "n-lab", "members": "patou, egpast, westurner, jontutcher"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - 
"url": "__GITLAB__/groups", - "http_method": "POST", - "post_data": {"group[name]": "n-lab", "group[path]": "n-lab"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "400,443,561,586"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 800, - "intent_template_id": 600, - "start_urls": ["__GITLAB__"], - "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY", - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": { - "name": "x-lab", - "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "__GITLAB__/groups", - "http_method": "POST", - "post_data": {"group[name]": "x-lab", "group[path]": "x-lab"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "632,64,86,96,340"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 801, - "intent_template_id": 600, - "start_urls": ["__GITLAB__"], - "intent": "create a new group \"crew\" with members 
ASWATFZLLC, patrickhlauke, westurner, linkmatrix", - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": {"name": "crew", "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "__GITLAB__/groups", - "http_method": "POST", - "post_data": {"group[name]": "crew", "group[path]": "crew"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "83,119,561,145"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 802, - "intent_template_id": 600, - "start_urls": ["__GITLAB__"], - "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U", - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": {"name": "coding_friends", "members": "qhduan, Agnes-U"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "__GITLAB__/groups", - "http_method": "POST", - "post_data": {"group[name]": "coding_friends", "group[path]": "coding_friends"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - 
"post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "196,244"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 803, - "intent_template_id": 600, - "start_urls": ["__GITLAB__"], - "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul", - "intent_template": "create a new group \"{{name}}\" with members {{members}}", - "instantiation_dict": {"name": "webagent", "members": "pandey2000, sayakpaul, sayakpaul"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "__GITLAB__/groups", - "http_method": "POST", - "post_data": {"group[name]": "webagent", "group[path]": "webagent"}, - "response_status": 302 - } - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "user_id": {"type": "string", "format": "string_list"} } - }, - "expected": { - "url": "^__GITLAB__/api/v4/groups/\\d+/invitations$", - "http_method": "POST", - "post_data": {"user_id": "223,224"}, - "response_status": 201 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 804, - "intent_template_id": 999, - "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding flash alerts in dialog components to myself and tag primer to include them as a participant.", - "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", - "instantiation_dict": { - "issue": "flash alerts in dialog components", - "issue_location": "", - "account": "myself and 
tag primer to include them as a participant" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } - }, - "expected": { - "url": "__GITLAB__/primer/design/-/issues/104.json", - "http_method": "PUT", - "post_data": { "$.issue.assignee_ids": [2330] } - } - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/primer/design/notes", - "query_params": { "target_type": ["Issue"], "target_id": ["83759"] }, - "http_method": "POST", - "post_data": {"$.note.note": "^.*@primer.*$"} - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 805, - "intent_template_id": 335, - "start_urls": ["__GITLAB__/a11yproject/a11yproject.com"], - "intent": "Submit a merge request for the branch that implements the support of template strings in the current repository to be merged into the default branch branch, assign myself and Roshan as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "the branch that implements the support of template strings", - "target_branch": "the default branch", - "reviewer": "myself and Roshan", - "source_project": "the current repository" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 806, - "intent_template_id": 335, - "start_urls": ["__GITLAB__"], - 
"intent": "Submit a merge request for redesign in a11yproject.com to be merged into markdown-figure-block branch, assign myself as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "redesign", - "target_branch": "markdown-figure-block", - "reviewer": "myself", - "source_project": "a11yproject.com" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests", - "http_method": "POST", - "post_data": { - "merge_request[target_branch]": "feature/markdown-figure-block", - "merge_request[source_branch]": "redesign", - "merge_request[target_project_id]": "174", - "merge_request[source_project_id]": "174", - "merge_request[assignee_ids][]": "0", - "merge_request[reviewer_ids][]": "2330" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 807, - "intent_template_id": 335, - "start_urls": ["__GITLAB__/primer/design"], - "intent": "Submit a merge request for build time debug in the current repository to be merged into main branch, assign myself as the reviewer", - "intent_template": "Submit a merge request for {{source_branch}} in {{source_project}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer", - "instantiation_dict": { - "source_branch": "build time debug", - "target_branch": "main", - "reviewer": "myself", - "source_project": "the current repository" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": { - "task_type": "mutate", - "status": "ACTION_NOT_ALLOWED_ERROR", - "retrieved_data": null - } - } 
- ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 808, - "intent_template_id": 327, - "start_urls": ["__GITLAB__"], - "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself (you can invite users to the project if needed). Set due date to be the end of Q1 2033", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). Set due date to be {{due}}", - "instantiation_dict": { - "repo": "cloud-to-butt", - "issue": "Let's keep the project alive", - "account": "myself", - "due": "the end of Q1 2033" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues", - "http_method": "POST", - "post_data": { - "issue[title]": "Let's keep the project alive", - "issue[assignee_ids][]": "2330", - "issue[due_date]": "2033-03-31" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 809, - "intent_template_id": 327, - "start_urls": ["__GITLAB__"], - "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself (you can invite users to the project if needed). Set due date to be 2030-1-3", - "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}} (you can invite users to the project if needed). 
Set due date to be {{due}}", - "instantiation_dict": { - "repo": "a11yproject", - "issue": "404 for many URLs", - "account": "myself", - "due": "2030-1-3" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues", - "http_method": "POST", - "post_data": { - "issue[title]": "404 for many URLs", - "issue[assignee_ids][]": "2330", - "issue[due_date]": "2030-01-03" - }, - "response_status": 302 - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 810, - "intent_template_id": 999, - "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding flash alert bug in primer design guide repo to myself.", - "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", - "instantiation_dict": { - "issue": "flash alert bug", - "issue_location": "in primer design guide repo ", - "account": "myself" - }, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "post_data_schema": { - "type": "object", - "properties": { "$.issue.assignee_ids": { "type": "array", "items": {"type": "number"} } } - }, - "expected": { - "url": "__GITLAB__/primer/design/-/issues/316.json", - "http_method": "PUT", - "post_data": { "$.issue.assignee_ids": [2330] } - } - } - ], - "revision": 2 - }, - { - "sites": ["gitlab"], - "task_id": 811, - "intent_template_id": 999, - "start_urls": ["__GITLAB__"], - "intent": "Assign the issue regarding 404 in a11yproject to myself.", - "intent_template": "Assign the issue regarding {{issue}} {{issue_location}}to {{account}}.", - "instantiation_dict": 
{"issue": "404", "issue_location": "in a11yproject ", "account": "myself"}, - "eval": [ - { - "evaluator": "AgentResponseEvaluator", - "results_schema": {"type": "null"}, - "expected": {"task_type": "mutate", "status": "SUCCESS", "retrieved_data": null} - }, - { - "evaluator": "NetworkEventEvaluator", - "ignored_query_params": ["serializer"], - "expected": { - "url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1478.json", - "http_method": "PUT", - "post_data": { "$.issue.assignee_ids": [2330] } - } - } - ], - "revision": 2 - } -] From cf116992929f4dc29166e2b36dfc37935d5b7125 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 16:49:20 +0000 Subject: [PATCH 53/64] remove metadata and create it dynamically --- .../experiments/benchmark/metadata/utils.py | 109 +++ .../benchmark/metadata/webarena_verified.csv | 813 ------------------ 2 files changed, 109 insertions(+), 813 deletions(-) delete mode 100644 browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py index 6c4f84bd..697f5911 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py @@ -1,4 +1,8 @@ +import csv +import importlib.resources import io +import json +import os import pkgutil from collections import defaultdict from copy import deepcopy @@ -9,7 +13,112 @@ from browsergym.experiments.loop import EnvArgs +def make_webarena_verified_metadata_if_not_exists(): + """ + Checks if the webarena_verified.csv file exists. If not, it creates it. 
+ """ + if os.path.exists(os.path.join(os.path.dirname(__file__), "webarena_verified.csv")): + return + + # Load the json file from the webarena-verified library + data = json.loads( + importlib.resources.files("webarena_verified") + .joinpath("assets/dataset/webarena-verified.json") + .read_text() + ) + # Create a mapping from task_id to intent_template_id and revision for efficient lookup. This is used to find the dependency task name. + task_id_to_template_id = {task["task_id"]: task["intent_template_id"] for task in data} + task_id_to_revision = {task["task_id"]: task["revision"] for task in data} + + # Read the original webarena.csv and create a mapping from task_id to original task info + original_csv_path = os.path.join(os.path.dirname(__file__), "webarena.csv") + original_tasks = {} + with open(original_csv_path, "r") as f: + reader = csv.DictReader(f) + for row in reader: + task_id = int(row["task_id"]) + original_tasks[task_id] = { + "requires_reset": row["requires_reset"], + "sites": row["sites"], + "eval_types": row["eval_types"], + "browsergym_split": row["browsergym_split"], + "depends_on": row["depends_on"], + } + + # Create CSV data + csv_data = [] + for task in data: + intent_template_id = task["intent_template_id"] + task_id = task["task_id"] + revision = task["revision"] + + # Extract eval_types + new_eval_types = [] + for evaluator_config in task.get("eval", []): + new_eval_types.append(evaluator_config["evaluator"]) + assert len(new_eval_types) > 0, f"Task {task_id} has no evaluators" + new_eval_types_str = " ".join(new_eval_types) + + # Extract new task sites + sites = task.get("sites", []) + sites_str = " ".join(sites) if sites else "" + + # Get original task data for comparison and dependency copying + original_task = original_tasks.get(task_id, {}) + + # Assert that new task sites matches the original task sites + assert sites_str == original_task.get( + "sites", "" + ), f"Task {task_id}: sites mismatch - JSON: {sites_str}, CSV: 
{original_task.get("sites", "")}" + + # Construct the dependency task name + if original_dependency := original_task.get("depends_on"): + dependency_task_id = int(original_dependency.split(".")[-1]) + dependency_template_id = task_id_to_template_id[dependency_task_id] + dependency_revision = task_id_to_revision[dependency_task_id] + dependency_task_name = f"webarena_verified.{dependency_template_id}.{dependency_task_id}.{dependency_revision}" + else: + dependency_task_name = "" + + # Create metadata row + row = { + "task_name": f"webarena_verified.{intent_template_id}.{task_id}.{revision}", + "requires_reset": str( + original_task.get("requires_reset", False) + ), # copy original requires_reset + "sites": sites_str, + "eval_types": new_eval_types_str, + "task_id": str(task_id), + "browsergym_split": original_task.get( + "browsergym_split", "train" + ), # copy original browsergym_split + "depends_on": dependency_task_name, + } + csv_data.append(row) + + # Write CSV file + output_path = os.path.join(os.path.dirname(__file__), "webarena_verified.csv") + with open(output_path, "w", newline="") as f: + fieldnames = [ + "task_name", + "requires_reset", + "sites", + "eval_types", + "task_id", + "browsergym_split", + "depends_on", + ] + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(csv_data) + + print(f"Created {output_path} with {len(csv_data)} tasks") + + def task_metadata(benchmark_name: str): + if benchmark_name == "webarena_verified": + make_webarena_verified_metadata_if_not_exists() + return task_metadata_from_csv( io.StringIO(pkgutil.get_data(__name__, f"{benchmark_name}.csv").decode("utf-8")) ) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv deleted file mode 100644 index 068301a0..00000000 --- 
a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv +++ /dev/null @@ -1,813 +0,0 @@ -task_name,requires_reset,sites,eval_types,task_id,browsergym_split,depends_on -webarena_verified.279.0.2,False,shopping_admin,AgentResponseEvaluator,0,train, -webarena_verified.279.1.2,False,shopping_admin,AgentResponseEvaluator,1,test,webarena_verified.279.0.2 -webarena_verified.279.2.2,False,shopping_admin,AgentResponseEvaluator,2,train,webarena_verified.279.1.2 -webarena_verified.279.3.2,False,shopping_admin,AgentResponseEvaluator,3,test,webarena_verified.279.2.2 -webarena_verified.279.4.2,False,shopping_admin,AgentResponseEvaluator,4,train,webarena_verified.279.3.2 -webarena_verified.279.5.2,False,shopping_admin,AgentResponseEvaluator,5,train,webarena_verified.279.4.2 -webarena_verified.279.6.2,False,shopping_admin,AgentResponseEvaluator,6,test,webarena_verified.279.5.2 -webarena_verified.79.7.2,False,map,AgentResponseEvaluator,7,train, -webarena_verified.79.8.2,False,map,AgentResponseEvaluator,8,test,webarena_verified.79.7.2 -webarena_verified.79.9.2,False,map,AgentResponseEvaluator,9,test,webarena_verified.79.8.2 -webarena_verified.79.10.2,False,map,AgentResponseEvaluator,10,test,webarena_verified.79.9.2 -webarena_verified.288.11.2,False,shopping_admin,AgentResponseEvaluator,11,test,webarena_verified.279.6.2 -webarena_verified.288.12.2,False,shopping_admin,AgentResponseEvaluator,12,train,webarena_verified.288.11.2 -webarena_verified.288.13.2,False,shopping_admin,AgentResponseEvaluator,13,train,webarena_verified.288.12.2 -webarena_verified.288.14.2,False,shopping_admin,AgentResponseEvaluator,14,train,webarena_verified.288.13.2 -webarena_verified.288.15.2,False,shopping_admin,AgentResponseEvaluator,15,test,webarena_verified.288.14.2 -webarena_verified.73.16.2,False,map,AgentResponseEvaluator,16,test,webarena_verified.79.10.2 -webarena_verified.73.17.2,False,map,AgentResponseEvaluator,17,train,webarena_verified.73.16.2 
-webarena_verified.73.18.2,False,map,AgentResponseEvaluator,18,test,webarena_verified.73.17.2 -webarena_verified.73.19.2,False,map,AgentResponseEvaluator,19,train,webarena_verified.73.18.2 -webarena_verified.73.20.2,False,map,AgentResponseEvaluator,20,test,webarena_verified.73.19.2 -webarena_verified.222.21.2,False,shopping,AgentResponseEvaluator,21,test, -webarena_verified.222.22.2,False,shopping,AgentResponseEvaluator,22,test,webarena_verified.222.21.2 -webarena_verified.222.23.2,False,shopping,AgentResponseEvaluator,23,test,webarena_verified.222.22.2 -webarena_verified.222.24.2,False,shopping,AgentResponseEvaluator,24,test,webarena_verified.222.23.2 -webarena_verified.222.25.2,False,shopping,AgentResponseEvaluator,25,test,webarena_verified.222.24.2 -webarena_verified.222.26.2,False,shopping,AgentResponseEvaluator,26,test,webarena_verified.222.25.2 -webarena_verified.33.27.2,False,reddit,AgentResponseEvaluator,27,test, -webarena_verified.33.28.2,False,reddit,AgentResponseEvaluator,28,train,webarena_verified.33.27.2 -webarena_verified.33.29.2,False,reddit,AgentResponseEvaluator,29,train,webarena_verified.33.28.2 -webarena_verified.33.30.2,False,reddit,AgentResponseEvaluator,30,test,webarena_verified.33.29.2 -webarena_verified.33.31.2,False,reddit,AgentResponseEvaluator,31,train,webarena_verified.33.30.2 -webarena_verified.78.32.2,False,map,AgentResponseEvaluator,32,test,webarena_verified.73.20.2 -webarena_verified.78.33.2,False,map,AgentResponseEvaluator,33,test,webarena_verified.78.32.2 -webarena_verified.78.34.2,False,map,AgentResponseEvaluator,34,train,webarena_verified.78.33.2 -webarena_verified.78.35.2,False,map,AgentResponseEvaluator,35,test,webarena_verified.78.34.2 -webarena_verified.77.36.2,False,map,AgentResponseEvaluator,36,test,webarena_verified.78.35.2 -webarena_verified.77.37.2,False,map,AgentResponseEvaluator,37,train,webarena_verified.77.36.2 -webarena_verified.77.38.2,False,map,AgentResponseEvaluator,38,train,webarena_verified.77.37.2 
-webarena_verified.77.39.2,False,map,AgentResponseEvaluator,39,train,webarena_verified.77.38.2 -webarena_verified.77.40.2,False,map,AgentResponseEvaluator,40,test,webarena_verified.77.39.2 -webarena_verified.285.41.2,False,shopping_admin,AgentResponseEvaluator,41,train,webarena_verified.288.15.2 -webarena_verified.285.42.2,False,shopping_admin,AgentResponseEvaluator,42,train,webarena_verified.285.41.2 -webarena_verified.285.43.2,False,shopping_admin,AgentResponseEvaluator,43,test,webarena_verified.285.42.2 -webarena_verified.303.44.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,44,train, -webarena_verified.300.45.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,45,test,webarena_verified.303.44.2 -webarena_verified.300.46.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,46,test,webarena_verified.300.45.2 -webarena_verified.197.47.2,False,shopping,AgentResponseEvaluator,47,train,webarena_verified.222.26.2 -webarena_verified.197.48.2,False,shopping,AgentResponseEvaluator,48,test,webarena_verified.197.47.2 -webarena_verified.197.49.2,False,shopping,AgentResponseEvaluator,49,train,webarena_verified.197.48.2 -webarena_verified.197.50.2,False,shopping,AgentResponseEvaluator,50,train,webarena_verified.197.49.2 -webarena_verified.197.51.2,False,shopping,AgentResponseEvaluator,51,test,webarena_verified.197.50.2 -webarena_verified.68.52.2,False,map,AgentResponseEvaluator,52,test,webarena_verified.77.40.2 -webarena_verified.68.53.2,False,map,AgentResponseEvaluator,53,train,webarena_verified.68.52.2 -webarena_verified.68.54.2,False,map,AgentResponseEvaluator,54,test,webarena_verified.68.53.2 -webarena_verified.68.55.2,False,map,AgentResponseEvaluator,55,train,webarena_verified.68.54.2 -webarena_verified.68.56.2,False,map,AgentResponseEvaluator,56,train,webarena_verified.68.55.2 -webarena_verified.69.57.2,False,map,AgentResponseEvaluator,57,train,webarena_verified.68.56.2 
-webarena_verified.69.58.2,False,map,AgentResponseEvaluator,58,train,webarena_verified.69.57.2 -webarena_verified.69.59.2,False,map,AgentResponseEvaluator,59,test,webarena_verified.69.58.2 -webarena_verified.69.60.2,False,map,AgentResponseEvaluator,60,test,webarena_verified.69.59.2 -webarena_verified.69.61.2,False,map,AgentResponseEvaluator,61,train,webarena_verified.69.60.2 -webarena_verified.276.62.2,False,shopping_admin,AgentResponseEvaluator,62,train,webarena_verified.285.43.2 -webarena_verified.276.63.2,False,shopping_admin,AgentResponseEvaluator,63,test,webarena_verified.276.62.2 -webarena_verified.276.64.2,False,shopping_admin,AgentResponseEvaluator,64,test,webarena_verified.276.63.2 -webarena_verified.276.65.2,False,shopping_admin,AgentResponseEvaluator,65,train,webarena_verified.276.64.2 -webarena_verified.17.66.2,False,reddit,AgentResponseEvaluator,66,test,webarena_verified.33.31.2 -webarena_verified.17.67.2,False,reddit,AgentResponseEvaluator,67,test,webarena_verified.17.66.2 -webarena_verified.17.68.2,False,reddit,AgentResponseEvaluator,68,train,webarena_verified.17.67.2 -webarena_verified.17.69.2,False,reddit,AgentResponseEvaluator,69,test,webarena_verified.17.68.2 -webarena_verified.70.70.2,False,map,AgentResponseEvaluator,70,train,webarena_verified.69.61.2 -webarena_verified.70.71.2,False,map,AgentResponseEvaluator,71,test,webarena_verified.70.70.2 -webarena_verified.70.72.2,False,map,AgentResponseEvaluator,72,train,webarena_verified.70.71.2 -webarena_verified.70.73.2,False,map,AgentResponseEvaluator,73,test,webarena_verified.70.72.2 -webarena_verified.65.74.2,False,map,AgentResponseEvaluator,74,train,webarena_verified.70.73.2 -webarena_verified.65.75.2,False,map,AgentResponseEvaluator,75,train,webarena_verified.65.74.2 -webarena_verified.65.76.2,False,map,AgentResponseEvaluator,76,train,webarena_verified.65.75.2 -webarena_verified.277.77.2,False,shopping_admin,AgentResponseEvaluator,77,test,webarena_verified.276.65.2 
-webarena_verified.277.78.2,False,shopping_admin,AgentResponseEvaluator,78,train,webarena_verified.277.77.2 -webarena_verified.277.79.2,False,shopping_admin,AgentResponseEvaluator,79,test,webarena_verified.277.78.2 -webarena_verified.72.80.2,False,map,AgentResponseEvaluator,80,test,webarena_verified.65.76.2 -webarena_verified.72.81.2,False,map,AgentResponseEvaluator,81,test,webarena_verified.72.80.2 -webarena_verified.72.82.2,False,map,AgentResponseEvaluator,82,train,webarena_verified.72.81.2 -webarena_verified.72.83.2,False,map,AgentResponseEvaluator,83,train,webarena_verified.72.82.2 -webarena_verified.64.84.2,False,map,AgentResponseEvaluator,84,train,webarena_verified.72.83.2 -webarena_verified.64.85.2,False,map,AgentResponseEvaluator,85,test,webarena_verified.64.84.2 -webarena_verified.64.86.2,False,map,AgentResponseEvaluator,86,test,webarena_verified.64.85.2 -webarena_verified.64.87.2,False,map,AgentResponseEvaluator,87,train,webarena_verified.64.86.2 -webarena_verified.64.88.2,False,map,AgentResponseEvaluator,88,train,webarena_verified.64.87.2 -webarena_verified.67.89.3,False,map,AgentResponseEvaluator,89,test,webarena_verified.64.88.2 -webarena_verified.67.90.3,False,map,AgentResponseEvaluator,90,test,webarena_verified.67.89.3 -webarena_verified.67.91.3,False,map,AgentResponseEvaluator,91,train,webarena_verified.67.90.3 -webarena_verified.67.92.3,False,map,AgentResponseEvaluator,92,train,webarena_verified.67.91.3 -webarena_verified.67.93.3,False,map,AgentResponseEvaluator,93,train,webarena_verified.67.92.3 -webarena_verified.274.94.2,False,shopping_admin,AgentResponseEvaluator,94,test,webarena_verified.277.79.2 -webarena_verified.274.95.2,False,shopping_admin,AgentResponseEvaluator,95,train,webarena_verified.274.94.2 -webarena_verified.193.96.2,False,shopping,AgentResponseEvaluator,96,test,webarena_verified.197.51.2 -webarena_verified.120.97.2,False,map wikipedia,AgentResponseEvaluator NetworkEventEvaluator,97,test,webarena_verified.67.93.3 
-webarena_verified.66.98.2,False,map,AgentResponseEvaluator,98,test,webarena_verified.120.97.2 -webarena_verified.66.99.2,False,map,AgentResponseEvaluator,99,train,webarena_verified.66.98.2 -webarena_verified.66.100.2,False,map,AgentResponseEvaluator,100,test,webarena_verified.66.99.2 -webarena_verified.66.101.2,False,map,AgentResponseEvaluator,101,train,webarena_verified.66.100.2 -webarena_verified.349.102.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,102,train,webarena_verified.300.46.2 -webarena_verified.349.103.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,103,train,webarena_verified.349.102.2 -webarena_verified.349.104.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,104,test,webarena_verified.349.103.2 -webarena_verified.349.105.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,105,train,webarena_verified.349.104.2 -webarena_verified.349.106.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,106,test,webarena_verified.349.105.2 -webarena_verified.270.107.2,False,shopping_admin,AgentResponseEvaluator,107,test,webarena_verified.274.95.2 -webarena_verified.270.108.2,False,shopping_admin,AgentResponseEvaluator,108,train,webarena_verified.270.107.2 -webarena_verified.270.109.2,False,shopping_admin,AgentResponseEvaluator,109,test,webarena_verified.270.108.2 -webarena_verified.270.110.2,False,shopping_admin,AgentResponseEvaluator,110,train,webarena_verified.270.109.2 -webarena_verified.270.111.2,False,shopping_admin,AgentResponseEvaluator,111,train,webarena_verified.270.110.2 -webarena_verified.245.112.2,False,shopping_admin,AgentResponseEvaluator,112,test,webarena_verified.270.111.2 -webarena_verified.245.113.2,False,shopping_admin,AgentResponseEvaluator,113,test,webarena_verified.245.112.2 -webarena_verified.245.114.2,False,shopping_admin,AgentResponseEvaluator,114,train,webarena_verified.245.113.2 
-webarena_verified.245.115.2,False,shopping_admin,AgentResponseEvaluator,115,test,webarena_verified.245.114.2 -webarena_verified.245.116.2,False,shopping_admin,AgentResponseEvaluator,116,test,webarena_verified.245.115.2 -webarena_verified.161.117.2,False,shopping,AgentResponseEvaluator,117,test,webarena_verified.193.96.2 -webarena_verified.151.118.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,118,train,webarena_verified.161.117.2 -webarena_verified.250.119.2,False,shopping_admin,AgentResponseEvaluator,119,test,webarena_verified.245.116.2 -webarena_verified.250.120.2,False,shopping_admin,AgentResponseEvaluator,120,train,webarena_verified.250.119.2 -webarena_verified.250.121.2,False,shopping_admin,AgentResponseEvaluator,121,train,webarena_verified.250.120.2 -webarena_verified.250.122.2,False,shopping_admin,AgentResponseEvaluator,122,test,webarena_verified.250.121.2 -webarena_verified.250.123.2,False,shopping_admin,AgentResponseEvaluator,123,train,webarena_verified.250.122.2 -webarena_verified.159.124.2,False,shopping,AgentResponseEvaluator,124,train,webarena_verified.151.118.2 -webarena_verified.159.125.2,False,shopping,AgentResponseEvaluator,125,train,webarena_verified.159.124.2 -webarena_verified.159.126.2,False,shopping,AgentResponseEvaluator,126,test,webarena_verified.159.125.2 -webarena_verified.1001.127.2,False,shopping_admin,AgentResponseEvaluator,127,train,webarena_verified.250.123.2 -webarena_verified.1002.128.2,False,shopping_admin,AgentResponseEvaluator,128,train,webarena_verified.1001.127.2 -webarena_verified.1002.129.2,False,shopping_admin,AgentResponseEvaluator,129,train,webarena_verified.1002.128.2 -webarena_verified.1002.130.2,False,shopping_admin,AgentResponseEvaluator,130,train,webarena_verified.1002.129.2 -webarena_verified.1002.131.2,False,shopping_admin,AgentResponseEvaluator,131,test,webarena_verified.1002.130.2 -webarena_verified.322.132.2,False,gitlab,AgentResponseEvaluator,132,train,webarena_verified.349.106.2 
-webarena_verified.322.133.2,False,gitlab,AgentResponseEvaluator,133,test,webarena_verified.322.132.2 -webarena_verified.322.134.2,False,gitlab,AgentResponseEvaluator,134,test,webarena_verified.322.133.2 -webarena_verified.322.135.2,False,gitlab,AgentResponseEvaluator,135,train,webarena_verified.322.134.2 -webarena_verified.322.136.2,False,gitlab,AgentResponseEvaluator,136,train,webarena_verified.322.135.2 -webarena_verified.51.137.2,False,map,AgentResponseEvaluator,137,test,webarena_verified.66.101.2 -webarena_verified.51.138.2,False,map,AgentResponseEvaluator,138,test,webarena_verified.51.137.2 -webarena_verified.51.139.2,False,map,AgentResponseEvaluator,139,test,webarena_verified.51.138.2 -webarena_verified.51.140.2,False,map,AgentResponseEvaluator,140,train,webarena_verified.51.139.2 -webarena_verified.162.141.2,False,shopping,AgentResponseEvaluator,141,train,webarena_verified.159.126.2 -webarena_verified.162.142.2,False,shopping,AgentResponseEvaluator,142,train,webarena_verified.162.141.2 -webarena_verified.162.143.2,False,shopping,AgentResponseEvaluator,143,test,webarena_verified.162.142.2 -webarena_verified.162.144.2,False,shopping,AgentResponseEvaluator,144,test,webarena_verified.162.143.2 -webarena_verified.162.145.2,False,shopping,AgentResponseEvaluator,145,train,webarena_verified.162.144.2 -webarena_verified.155.146.2,False,shopping,AgentResponseEvaluator,146,test,webarena_verified.162.145.2 -webarena_verified.155.147.2,False,shopping,AgentResponseEvaluator,147,train,webarena_verified.155.146.2 -webarena_verified.155.148.2,False,shopping,AgentResponseEvaluator,148,train,webarena_verified.155.147.2 -webarena_verified.155.149.2,False,shopping,AgentResponseEvaluator,149,test,webarena_verified.155.148.2 -webarena_verified.155.150.2,False,shopping,AgentResponseEvaluator,150,train,webarena_verified.155.149.2 -webarena_verified.36.151.2,False,map,AgentResponseEvaluator,151,train,webarena_verified.51.140.2 
-webarena_verified.36.152.2,False,map,AgentResponseEvaluator,152,train,webarena_verified.36.151.2 -webarena_verified.36.153.2,False,map,AgentResponseEvaluator,153,test,webarena_verified.36.152.2 -webarena_verified.36.154.2,False,map,AgentResponseEvaluator,154,train,webarena_verified.36.153.2 -webarena_verified.36.155.2,False,map,AgentResponseEvaluator,155,test,webarena_verified.36.154.2 -webarena_verified.290.156.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,156,test,webarena_verified.322.136.2 -webarena_verified.255.157.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,157,train,webarena_verified.1002.131.2 -webarena_verified.171.158.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,158,test,webarena_verified.155.150.2 -webarena_verified.171.159.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,159,train,webarena_verified.171.158.2 -webarena_verified.171.160.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,160,train,webarena_verified.171.159.2 -webarena_verified.171.161.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,161,train,webarena_verified.171.160.2 -webarena_verified.171.162.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,162,test,webarena_verified.171.161.2 -webarena_verified.136.163.2,False,shopping,AgentResponseEvaluator,163,test,webarena_verified.171.162.2 -webarena_verified.136.164.2,False,shopping,AgentResponseEvaluator,164,test,webarena_verified.136.163.2 -webarena_verified.136.165.2,False,shopping,AgentResponseEvaluator,165,test,webarena_verified.136.164.2 -webarena_verified.136.166.2,False,shopping,AgentResponseEvaluator,166,test,webarena_verified.136.165.2 -webarena_verified.136.167.2,False,shopping,AgentResponseEvaluator,167,test,webarena_verified.136.166.2 -webarena_verified.289.168.2,False,gitlab,AgentResponseEvaluator,168,test,webarena_verified.290.156.2 
-webarena_verified.289.169.2,False,gitlab,AgentResponseEvaluator,169,train,webarena_verified.289.168.2 -webarena_verified.289.170.2,False,gitlab,AgentResponseEvaluator,170,train,webarena_verified.289.169.2 -webarena_verified.289.171.2,False,gitlab,AgentResponseEvaluator,171,test,webarena_verified.289.170.2 -webarena_verified.289.172.2,False,gitlab,AgentResponseEvaluator,172,train,webarena_verified.289.171.2 -webarena_verified.310.173.2,False,gitlab,AgentResponseEvaluator,173,train,webarena_verified.289.172.2 -webarena_verified.310.174.2,False,gitlab,AgentResponseEvaluator,174,test,webarena_verified.310.173.2 -webarena_verified.310.175.2,False,gitlab,AgentResponseEvaluator,175,train,webarena_verified.310.174.2 -webarena_verified.310.176.2,False,gitlab,AgentResponseEvaluator,176,train,webarena_verified.310.175.2 -webarena_verified.310.177.2,False,gitlab,AgentResponseEvaluator,177,test,webarena_verified.310.176.2 -webarena_verified.500.178.2,False,gitlab,AgentResponseEvaluator,178,test,webarena_verified.310.177.2 -webarena_verified.500.179.2,False,gitlab,AgentResponseEvaluator,179,train,webarena_verified.500.178.2 -webarena_verified.500.180.2,False,gitlab,AgentResponseEvaluator,180,train,webarena_verified.500.179.2 -webarena_verified.500.181.2,False,gitlab,AgentResponseEvaluator,181,test,webarena_verified.500.180.2 -webarena_verified.500.182.2,False,gitlab,AgentResponseEvaluator,182,train,webarena_verified.500.181.2 -webarena_verified.368.183.2,False,shopping_admin,AgentResponseEvaluator,183,train,webarena_verified.255.157.2 -webarena_verified.368.184.2,False,shopping_admin,AgentResponseEvaluator,184,train,webarena_verified.368.183.2 -webarena_verified.368.185.2,False,shopping_admin,AgentResponseEvaluator,185,test,webarena_verified.368.184.2 -webarena_verified.368.186.2,False,shopping_admin,AgentResponseEvaluator,186,train,webarena_verified.368.185.2 -webarena_verified.368.187.2,False,shopping_admin,AgentResponseEvaluator,187,test,webarena_verified.368.186.2 
-webarena_verified.214.188.2,False,shopping,AgentResponseEvaluator,188,test,webarena_verified.136.167.2 -webarena_verified.214.189.2,False,shopping,AgentResponseEvaluator,189,train,webarena_verified.214.188.2 -webarena_verified.214.190.2,False,shopping,AgentResponseEvaluator,190,train,webarena_verified.214.189.2 -webarena_verified.214.191.2,False,shopping,AgentResponseEvaluator,191,train,webarena_verified.214.190.2 -webarena_verified.214.192.2,False,shopping,AgentResponseEvaluator,192,test,webarena_verified.214.191.2 -webarena_verified.367.193.2,False,shopping_admin,AgentResponseEvaluator,193,train,webarena_verified.368.187.2 -webarena_verified.367.194.2,False,shopping_admin,AgentResponseEvaluator,194,train,webarena_verified.367.193.2 -webarena_verified.367.195.2,False,shopping_admin,AgentResponseEvaluator,195,test,webarena_verified.367.194.2 -webarena_verified.367.196.2,False,shopping_admin,AgentResponseEvaluator,196,train,webarena_verified.367.195.2 -webarena_verified.367.197.2,False,shopping_admin,AgentResponseEvaluator,197,train,webarena_verified.367.196.2 -webarena_verified.366.198.2,False,shopping_admin,AgentResponseEvaluator,198,train,webarena_verified.367.197.2 -webarena_verified.366.199.2,False,shopping_admin,AgentResponseEvaluator,199,train,webarena_verified.366.198.2 -webarena_verified.366.200.2,False,shopping_admin,AgentResponseEvaluator,200,train,webarena_verified.366.199.2 -webarena_verified.366.201.2,False,shopping_admin,AgentResponseEvaluator,201,test,webarena_verified.366.200.2 -webarena_verified.366.202.2,False,shopping_admin,AgentResponseEvaluator,202,train,webarena_verified.366.201.2 -webarena_verified.366.203.2,False,shopping_admin,AgentResponseEvaluator,203,test,webarena_verified.366.202.2 -webarena_verified.366.204.2,False,shopping_admin,AgentResponseEvaluator,204,test,webarena_verified.366.203.2 -webarena_verified.320.205.2,False,gitlab,AgentResponseEvaluator,205,train,webarena_verified.500.182.2 
-webarena_verified.320.206.2,False,gitlab,AgentResponseEvaluator,206,test,webarena_verified.320.205.2 -webarena_verified.320.207.2,False,gitlab,AgentResponseEvaluator,207,test,webarena_verified.320.206.2 -webarena_verified.364.208.2,False,shopping_admin,AgentResponseEvaluator,208,test,webarena_verified.366.204.2 -webarena_verified.364.209.2,False,shopping_admin,AgentResponseEvaluator,209,test,webarena_verified.364.208.2 -webarena_verified.364.210.2,False,shopping_admin,AgentResponseEvaluator,210,train,webarena_verified.364.209.2 -webarena_verified.364.211.2,False,shopping_admin,AgentResponseEvaluator,211,train,webarena_verified.364.210.2 -webarena_verified.364.212.2,False,shopping_admin,AgentResponseEvaluator,212,train,webarena_verified.364.211.2 -webarena_verified.249.213.2,False,shopping_admin,AgentResponseEvaluator,213,test,webarena_verified.364.212.2 -webarena_verified.249.214.2,False,shopping_admin,AgentResponseEvaluator,214,train,webarena_verified.249.213.2 -webarena_verified.249.215.2,False,shopping_admin,AgentResponseEvaluator,215,test,webarena_verified.249.214.2 -webarena_verified.249.216.2,False,shopping_admin,AgentResponseEvaluator,216,train,webarena_verified.249.215.2 -webarena_verified.249.217.2,False,shopping_admin,AgentResponseEvaluator,217,train,webarena_verified.249.216.2 -webarena_verified.41.218.2,False,map,AgentResponseEvaluator,218,train,webarena_verified.36.155.2 -webarena_verified.41.219.2,False,map,AgentResponseEvaluator,219,test,webarena_verified.41.218.2 -webarena_verified.41.220.2,False,map,AgentResponseEvaluator,220,train,webarena_verified.41.219.2 -webarena_verified.35.221.2,False,map,AgentResponseEvaluator,221,test,webarena_verified.41.220.2 -webarena_verified.35.222.2,False,map,AgentResponseEvaluator,222,train,webarena_verified.35.221.2 -webarena_verified.35.223.2,False,map,AgentResponseEvaluator,223,test,webarena_verified.35.222.2 -webarena_verified.35.224.2,False,map,AgentResponseEvaluator,224,test,webarena_verified.35.223.2 
-webarena_verified.135.225.2,False,shopping,AgentResponseEvaluator,225,test,webarena_verified.214.192.2 -webarena_verified.370.226.2,False,shopping,AgentResponseEvaluator,226,train,webarena_verified.135.225.2 -webarena_verified.370.227.2,False,shopping,AgentResponseEvaluator,227,train,webarena_verified.370.226.2 -webarena_verified.370.228.2,False,shopping,AgentResponseEvaluator,228,test,webarena_verified.370.227.2 -webarena_verified.370.229.2,False,shopping,AgentResponseEvaluator,229,test,webarena_verified.370.228.2 -webarena_verified.370.230.2,False,shopping,AgentResponseEvaluator,230,train,webarena_verified.370.229.2 -webarena_verified.213.231.2,False,shopping,AgentResponseEvaluator,231,test,webarena_verified.370.230.2 -webarena_verified.213.232.2,False,shopping,AgentResponseEvaluator,232,train,webarena_verified.213.231.2 -webarena_verified.213.233.2,False,shopping,AgentResponseEvaluator,233,test,webarena_verified.213.232.2 -webarena_verified.213.234.2,False,shopping,AgentResponseEvaluator,234,train,webarena_verified.213.233.2 -webarena_verified.213.235.2,False,shopping,AgentResponseEvaluator,235,train,webarena_verified.213.234.2 -webarena_verified.39.236.2,False,map,AgentResponseEvaluator,236,train,webarena_verified.35.224.2 -webarena_verified.39.237.2,False,map,AgentResponseEvaluator,237,train,webarena_verified.39.236.2 -webarena_verified.138.238.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,238,train,webarena_verified.213.235.2 -webarena_verified.138.239.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,239,train,webarena_verified.138.238.2 -webarena_verified.138.240.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,240,test,webarena_verified.138.239.2 -webarena_verified.138.241.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,241,train,webarena_verified.138.240.2 -webarena_verified.138.242.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,242,test,webarena_verified.138.241.2 
-webarena_verified.244.243.2,False,shopping_admin,AgentResponseEvaluator,243,train,webarena_verified.249.217.2 -webarena_verified.244.244.2,False,shopping_admin,AgentResponseEvaluator,244,test,webarena_verified.244.243.2 -webarena_verified.244.245.2,False,shopping_admin,AgentResponseEvaluator,245,train,webarena_verified.244.244.2 -webarena_verified.244.246.2,False,shopping_admin,AgentResponseEvaluator,246,test,webarena_verified.244.245.2 -webarena_verified.244.247.2,False,shopping_admin,AgentResponseEvaluator,247,train,webarena_verified.244.246.2 -webarena_verified.46.248.2,False,map,AgentResponseEvaluator,248,test,webarena_verified.39.237.2 -webarena_verified.46.249.2,False,map,AgentResponseEvaluator,249,train,webarena_verified.46.248.2 -webarena_verified.46.250.2,False,map,AgentResponseEvaluator,250,test,webarena_verified.46.249.2 -webarena_verified.46.251.2,False,map,AgentResponseEvaluator,251,train,webarena_verified.46.250.2 -webarena_verified.46.252.2,False,map,AgentResponseEvaluator,252,train,webarena_verified.46.251.2 -webarena_verified.501.253.2,False,map,AgentResponseEvaluator,253,test,webarena_verified.46.252.2 -webarena_verified.501.254.2,False,map,AgentResponseEvaluator,254,train,webarena_verified.501.253.2 -webarena_verified.501.255.2,False,map,AgentResponseEvaluator,255,test,webarena_verified.501.254.2 -webarena_verified.501.256.2,False,map,AgentResponseEvaluator,256,train,webarena_verified.501.255.2 -webarena_verified.501.257.2,False,map,AgentResponseEvaluator,257,test,webarena_verified.501.256.2 -webarena_verified.325.258.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,258,train,webarena_verified.320.207.2 -webarena_verified.312.259.2,False,gitlab,AgentResponseEvaluator,259,train,webarena_verified.325.258.2 -webarena_verified.211.260.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,260,test,webarena_verified.138.242.2 -webarena_verified.211.261.2,False,shopping,AgentResponseEvaluator 
NetworkEventEvaluator,261,train,webarena_verified.211.260.2 -webarena_verified.211.262.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,262,train,webarena_verified.211.261.2 -webarena_verified.211.263.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,263,test,webarena_verified.211.262.2 -webarena_verified.211.264.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,264,train,webarena_verified.211.263.2 -webarena_verified.85.265.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,265,test,webarena_verified.501.257.2 -webarena_verified.85.266.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,266,test,webarena_verified.85.265.4 -webarena_verified.85.267.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,267,train,webarena_verified.85.266.4 -webarena_verified.85.268.4,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,268,test,webarena_verified.85.267.4 -webarena_verified.139.269.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,269,train,webarena_verified.211.264.2 -webarena_verified.139.270.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,270,train,webarena_verified.139.269.2 -webarena_verified.139.271.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,271,test,webarena_verified.139.270.2 -webarena_verified.139.272.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,272,test,webarena_verified.139.271.2 -webarena_verified.139.273.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,273,train,webarena_verified.139.272.2 -webarena_verified.212.274.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,274,test,webarena_verified.139.273.2 -webarena_verified.212.275.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,275,test,webarena_verified.212.274.2 -webarena_verified.212.276.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,276,train,webarena_verified.212.275.2 
-webarena_verified.212.277.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,277,train,webarena_verified.212.276.2 -webarena_verified.212.278.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,278,train,webarena_verified.212.277.2 -webarena_verified.204.279.2,False,shopping,AgentResponseEvaluator,279,train,webarena_verified.212.278.2 -webarena_verified.204.280.2,False,shopping,AgentResponseEvaluator,280,test,webarena_verified.204.279.2 -webarena_verified.204.281.2,False,shopping,AgentResponseEvaluator,281,train,webarena_verified.204.280.2 -webarena_verified.204.282.2,False,shopping,AgentResponseEvaluator,282,train,webarena_verified.204.281.2 -webarena_verified.210.283.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,283,test,webarena_verified.204.282.2 -webarena_verified.207.284.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,284,test,webarena_verified.210.283.2 -webarena_verified.207.285.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,285,train,webarena_verified.207.284.2 -webarena_verified.207.286.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,286,test,webarena_verified.207.285.2 -webarena_verified.47.287.2,False,map,AgentResponseEvaluator,287,test,webarena_verified.85.268.4 -webarena_verified.234.288.2,False,shopping_admin,AgentResponseEvaluator,288,train,webarena_verified.244.247.2 -webarena_verified.234.289.2,False,shopping_admin,AgentResponseEvaluator,289,test,webarena_verified.234.288.2 -webarena_verified.234.290.2,False,shopping_admin,AgentResponseEvaluator,290,train,webarena_verified.234.289.2 -webarena_verified.234.291.2,False,shopping_admin,AgentResponseEvaluator,291,train,webarena_verified.234.290.2 -webarena_verified.234.292.2,False,shopping_admin,AgentResponseEvaluator,292,test,webarena_verified.234.291.2 -webarena_verified.329.293.2,False,gitlab,AgentResponseEvaluator,293,train,webarena_verified.312.259.2 
-webarena_verified.329.294.2,False,gitlab,AgentResponseEvaluator,294,train,webarena_verified.329.293.2 -webarena_verified.329.295.2,False,gitlab,AgentResponseEvaluator,295,test,webarena_verified.329.294.2 -webarena_verified.329.296.2,False,gitlab,AgentResponseEvaluator,296,train,webarena_verified.329.295.2 -webarena_verified.329.297.2,False,gitlab,AgentResponseEvaluator,297,test,webarena_verified.329.296.2 -webarena_verified.180.298.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,298,train,webarena_verified.207.286.2 -webarena_verified.180.299.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,299,train,webarena_verified.180.298.2 -webarena_verified.180.300.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,300,test,webarena_verified.180.299.2 -webarena_verified.180.301.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,301,test,webarena_verified.180.300.2 -webarena_verified.180.302.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,302,train,webarena_verified.180.301.2 -webarena_verified.321.303.2,False,gitlab,AgentResponseEvaluator,303,test,webarena_verified.329.297.2 -webarena_verified.321.304.2,False,gitlab,AgentResponseEvaluator,304,train,webarena_verified.321.303.2 -webarena_verified.321.305.2,False,gitlab,AgentResponseEvaluator,305,train,webarena_verified.321.304.2 -webarena_verified.321.306.2,False,gitlab,AgentResponseEvaluator,306,test,webarena_verified.321.305.2 -webarena_verified.321.307.2,False,gitlab,AgentResponseEvaluator,307,train,webarena_verified.321.306.2 -webarena_verified.323.308.2,False,gitlab,AgentResponseEvaluator,308,train,webarena_verified.321.307.2 -webarena_verified.323.309.2,False,gitlab,AgentResponseEvaluator,309,train,webarena_verified.323.308.2 -webarena_verified.323.310.2,False,gitlab,AgentResponseEvaluator,310,train,webarena_verified.323.309.2 -webarena_verified.323.311.2,False,gitlab,AgentResponseEvaluator,311,test,webarena_verified.323.310.2 
-webarena_verified.323.312.2,False,gitlab,AgentResponseEvaluator,312,test,webarena_verified.323.311.2 -webarena_verified.134.313.2,False,shopping,AgentResponseEvaluator,313,train,webarena_verified.180.302.2 -webarena_verified.324.314.2,False,gitlab,AgentResponseEvaluator,314,train,webarena_verified.323.312.2 -webarena_verified.324.315.2,False,gitlab,AgentResponseEvaluator,315,train,webarena_verified.324.314.2 -webarena_verified.324.316.2,False,gitlab,AgentResponseEvaluator,316,test,webarena_verified.324.315.2 -webarena_verified.324.317.2,False,gitlab,AgentResponseEvaluator,317,test,webarena_verified.324.316.2 -webarena_verified.324.318.2,False,gitlab,AgentResponseEvaluator,318,train,webarena_verified.324.317.2 -webarena_verified.160.319.2,False,shopping,AgentResponseEvaluator,319,train,webarena_verified.134.313.2 -webarena_verified.160.320.2,False,shopping,AgentResponseEvaluator,320,test,webarena_verified.160.319.2 -webarena_verified.160.321.2,False,shopping,AgentResponseEvaluator,321,train,webarena_verified.160.320.2 -webarena_verified.160.322.2,False,shopping,AgentResponseEvaluator,322,test,webarena_verified.160.321.2 -webarena_verified.160.323.2,False,shopping,AgentResponseEvaluator,323,train,webarena_verified.160.322.2 -webarena_verified.208.324.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,324,train,webarena_verified.160.323.2 -webarena_verified.208.325.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,325,test,webarena_verified.208.324.2 -webarena_verified.208.326.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,326,train,webarena_verified.208.325.2 -webarena_verified.208.327.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,327,test,webarena_verified.208.326.2 -webarena_verified.208.328.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,328,train,webarena_verified.208.327.2 -webarena_verified.147.329.2,False,shopping,AgentResponseEvaluator,329,test,webarena_verified.208.328.2 
-webarena_verified.147.330.2,False,shopping,AgentResponseEvaluator,330,test,webarena_verified.147.329.2 -webarena_verified.147.331.2,False,shopping,AgentResponseEvaluator,331,test,webarena_verified.147.330.2 -webarena_verified.147.332.2,False,shopping,AgentResponseEvaluator,332,train,webarena_verified.147.331.2 -webarena_verified.147.333.2,False,shopping,AgentResponseEvaluator,333,train,webarena_verified.147.332.2 -webarena_verified.169.334.2,False,shopping,AgentResponseEvaluator,334,train,webarena_verified.147.333.2 -webarena_verified.169.335.2,False,shopping,AgentResponseEvaluator,335,train,webarena_verified.169.334.2 -webarena_verified.169.336.2,False,shopping,AgentResponseEvaluator,336,test,webarena_verified.169.335.2 -webarena_verified.169.337.2,False,shopping,AgentResponseEvaluator,337,test,webarena_verified.169.336.2 -webarena_verified.169.338.2,False,shopping,AgentResponseEvaluator,338,train,webarena_verified.169.337.2 -webarena_verified.299.339.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,339,test,webarena_verified.324.318.2 -webarena_verified.299.340.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,340,train,webarena_verified.299.339.2 -webarena_verified.299.341.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,341,test,webarena_verified.299.340.2 -webarena_verified.299.342.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,342,test,webarena_verified.299.341.2 -webarena_verified.299.343.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,343,test,webarena_verified.299.342.2 -webarena_verified.248.344.2,False,shopping_admin,AgentResponseEvaluator,344,test,webarena_verified.234.292.2 -webarena_verified.248.345.2,False,shopping_admin,AgentResponseEvaluator,345,train,webarena_verified.248.344.2 
-webarena_verified.248.346.2,False,shopping_admin,AgentResponseEvaluator,346,train,webarena_verified.248.345.2 -webarena_verified.248.347.2,False,shopping_admin,AgentResponseEvaluator,347,train,webarena_verified.248.346.2 -webarena_verified.248.348.2,False,shopping_admin,AgentResponseEvaluator,348,test,webarena_verified.248.347.2 -webarena_verified.298.349.3,False,gitlab,AgentResponseEvaluator,349,test,webarena_verified.299.343.2 -webarena_verified.298.350.3,False,gitlab,AgentResponseEvaluator,350,test,webarena_verified.298.349.3 -webarena_verified.137.351.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,351,train,webarena_verified.169.338.2 -webarena_verified.137.352.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,352,test,webarena_verified.137.351.2 -webarena_verified.137.353.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,353,test,webarena_verified.137.352.2 -webarena_verified.137.354.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,354,train,webarena_verified.137.353.2 -webarena_verified.137.355.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,355,train,webarena_verified.137.354.2 -webarena_verified.49.356.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,356,test,webarena_verified.47.287.2 -webarena_verified.291.357.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,357,test,webarena_verified.298.350.3 -webarena_verified.206.358.2,False,shopping,AgentResponseEvaluator,358,train,webarena_verified.137.355.2 -webarena_verified.206.359.2,False,shopping,AgentResponseEvaluator,359,test,webarena_verified.206.358.2 -webarena_verified.206.360.2,False,shopping,AgentResponseEvaluator,360,train,webarena_verified.206.359.2 -webarena_verified.206.361.2,False,shopping,AgentResponseEvaluator,361,train,webarena_verified.206.360.2 -webarena_verified.206.362.2,False,shopping,AgentResponseEvaluator,362,test,webarena_verified.206.361.2 
-webarena_verified.58.363.2,False,map,AgentResponseEvaluator,363,train,webarena_verified.49.356.2 -webarena_verified.58.364.2,False,map,AgentResponseEvaluator,364,test,webarena_verified.58.363.2 -webarena_verified.58.365.2,False,map,AgentResponseEvaluator,365,test,webarena_verified.58.364.2 -webarena_verified.58.366.2,False,map,AgentResponseEvaluator,366,train,webarena_verified.58.365.2 -webarena_verified.58.367.2,False,map,AgentResponseEvaluator,367,train,webarena_verified.58.366.2 -webarena_verified.188.368.2,False,shopping,AgentResponseEvaluator,368,test,webarena_verified.206.362.2 -webarena_verified.52.369.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,369,train,webarena_verified.58.367.2 -webarena_verified.52.370.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,370,test,webarena_verified.52.369.2 -webarena_verified.52.371.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,371,test,webarena_verified.52.370.2 -webarena_verified.52.372.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,372,train,webarena_verified.52.371.2 -webarena_verified.52.373.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,373,train,webarena_verified.52.372.2 -webarena_verified.266.374.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,374,train,webarena_verified.248.348.2 -webarena_verified.266.375.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,375,train,webarena_verified.266.374.2 -webarena_verified.182.376.2,False,shopping,AgentResponseEvaluator,376,test,webarena_verified.188.368.2 -webarena_verified.59.377.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,377,test,webarena_verified.52.373.2 -webarena_verified.59.378.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,378,train,webarena_verified.59.377.2 -webarena_verified.59.379.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,379,train,webarena_verified.59.378.2 -webarena_verified.59.380.2,False,map,AgentResponseEvaluator 
NetworkEventEvaluator,380,test,webarena_verified.59.379.2 -webarena_verified.59.381.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,381,train,webarena_verified.59.380.2 -webarena_verified.781.382.2,False,map,AgentResponseEvaluator,382,test,webarena_verified.59.381.2 -webarena_verified.782.383.2,False,map,AgentResponseEvaluator,383,test,webarena_verified.781.382.2 -webarena_verified.666.384.2,False,shopping,AgentResponseEvaluator,384,test,webarena_verified.182.376.2 -webarena_verified.666.385.2,False,shopping,AgentResponseEvaluator,385,train,webarena_verified.666.384.2 -webarena_verified.1355.386.2,False,shopping,AgentResponseEvaluator,386,test,webarena_verified.666.385.2 -webarena_verified.1356.387.2,False,shopping,AgentResponseEvaluator,387,train,webarena_verified.1355.386.2 -webarena_verified.1356.388.2,False,shopping,AgentResponseEvaluator,388,test,webarena_verified.1356.387.2 -webarena_verified.348.389.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,389,test,webarena_verified.291.357.2 -webarena_verified.348.390.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,390,train,webarena_verified.348.389.2 -webarena_verified.348.391.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,391,train,webarena_verified.348.390.2 -webarena_verified.348.392.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,392,test,webarena_verified.348.391.2 -webarena_verified.348.393.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,393,train,webarena_verified.348.392.2 -webarena_verified.352.394.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,394,test,webarena_verified.348.393.2 -webarena_verified.352.395.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,395,train,webarena_verified.352.394.2 -webarena_verified.352.396.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,396,train,webarena_verified.352.395.2 -webarena_verified.352.397.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator,397,train,webarena_verified.352.396.2 -webarena_verified.352.398.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,398,test,webarena_verified.352.397.2 -webarena_verified.6.399.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,399,train,webarena_verified.17.69.2 -webarena_verified.6.400.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,400,test,webarena_verified.6.399.2 -webarena_verified.6.401.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,401,train,webarena_verified.6.400.2 -webarena_verified.6.402.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,402,train,webarena_verified.6.401.2 -webarena_verified.6.403.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,403,test,webarena_verified.6.402.2 -webarena_verified.22.404.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,404,train,webarena_verified.6.403.2 -webarena_verified.22.405.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,405,test,webarena_verified.22.404.2 -webarena_verified.22.406.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,406,train,webarena_verified.22.405.2 -webarena_verified.22.407.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,407,test,webarena_verified.22.406.2 -webarena_verified.22.408.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,408,train,webarena_verified.22.407.2 -webarena_verified.23.409.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,409,test,webarena_verified.22.408.2 -webarena_verified.23.410.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,410,test,webarena_verified.23.409.2 -webarena_verified.355.411.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,411,test,webarena_verified.352.398.2 -webarena_verified.355.412.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,412,test,webarena_verified.355.411.2 
-webarena_verified.355.413.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,413,test,webarena_verified.355.412.2 -webarena_verified.355.414.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,414,test,webarena_verified.355.413.2 -webarena_verified.360.415.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,415,test,webarena_verified.355.414.2 -webarena_verified.360.416.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,416,test,webarena_verified.360.415.2 -webarena_verified.360.417.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,417,test,webarena_verified.360.416.2 -webarena_verified.361.418.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,418,train,webarena_verified.360.417.2 -webarena_verified.361.419.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,419,test,webarena_verified.361.418.2 -webarena_verified.361.420.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,420,test,webarena_verified.361.419.2 -webarena_verified.361.421.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,421,train,webarena_verified.361.420.2 -webarena_verified.361.422.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,422,train,webarena_verified.361.421.2 -webarena_verified.237.423.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,423,train,webarena_verified.266.375.2 -webarena_verified.371.424.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,424,train,webarena_verified.782.383.2 -webarena_verified.371.425.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,425,train,webarena_verified.371.424.2 -webarena_verified.371.426.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,426,test,webarena_verified.371.425.2 -webarena_verified.371.427.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,427,test,webarena_verified.371.426.2 -webarena_verified.371.428.2,False,wikipedia map,AgentResponseEvaluator 
NetworkEventEvaluator,428,train,webarena_verified.371.427.2 -webarena_verified.371.429.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,429,train,webarena_verified.371.428.2 -webarena_verified.371.430.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,430,test,webarena_verified.371.429.2 -webarena_verified.145.431.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,431,train,webarena_verified.1356.388.2 -webarena_verified.145.432.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,432,test,webarena_verified.145.431.2 -webarena_verified.145.433.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,433,train,webarena_verified.145.432.2 -webarena_verified.145.434.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,434,train,webarena_verified.145.433.2 -webarena_verified.145.435.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,435,train,webarena_verified.145.434.2 -webarena_verified.156.436.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,436,test,webarena_verified.145.435.2 -webarena_verified.156.437.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,437,train,webarena_verified.156.436.2 -webarena_verified.156.438.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,438,train,webarena_verified.156.437.2 -webarena_verified.156.439.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,439,train,webarena_verified.156.438.2 -webarena_verified.156.440.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,440,test,webarena_verified.156.439.2 -webarena_verified.308.441.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,441,train,webarena_verified.361.422.2 -webarena_verified.308.442.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,442,train,webarena_verified.308.441.2 -webarena_verified.308.443.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,443,test,webarena_verified.308.442.2 
-webarena_verified.308.444.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,444,train,webarena_verified.308.443.2 -webarena_verified.308.445.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,445,test,webarena_verified.308.444.2 -webarena_verified.999.446.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,446,test,webarena_verified.308.445.2 -webarena_verified.999.447.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,447,train,webarena_verified.999.446.2 -webarena_verified.331.448.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,448,test,webarena_verified.999.447.2 -webarena_verified.331.449.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,449,test,webarena_verified.331.448.2 -webarena_verified.331.450.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,450,train,webarena_verified.331.449.2 -webarena_verified.331.451.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,451,train,webarena_verified.331.450.2 -webarena_verified.331.452.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,452,train,webarena_verified.331.451.2 -webarena_verified.242.453.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,453,train,webarena_verified.237.423.2 -webarena_verified.242.454.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,454,test,webarena_verified.242.453.2 -webarena_verified.242.455.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,455,train,webarena_verified.242.454.2 -webarena_verified.242.456.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,456,test,webarena_verified.242.455.2 -webarena_verified.242.457.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,457,train,webarena_verified.242.456.2 -webarena_verified.247.458.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,458,test,webarena_verified.242.457.2 -webarena_verified.247.459.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,459,test,webarena_verified.247.458.2 -webarena_verified.247.460.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,460,train,webarena_verified.247.459.2 -webarena_verified.247.461.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,461,train,webarena_verified.247.460.2 -webarena_verified.247.462.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,462,test,webarena_verified.247.461.2 -webarena_verified.247.463.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,463,test,webarena_verified.247.462.2 -webarena_verified.251.464.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,464,train,webarena_verified.247.463.2 -webarena_verified.186.465.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,465,train,webarena_verified.156.440.2 -webarena_verified.186.466.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,466,train,webarena_verified.186.465.2 -webarena_verified.186.467.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,467,train,webarena_verified.186.466.2 -webarena_verified.186.468.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,468,test,webarena_verified.186.467.2 -webarena_verified.186.469.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,469,test,webarena_verified.186.468.2 -webarena_verified.257.470.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,470,test,webarena_verified.251.464.2 -webarena_verified.257.471.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,471,test,webarena_verified.257.470.2 -webarena_verified.257.472.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,472,train,webarena_verified.257.471.2 -webarena_verified.257.473.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,473,train,webarena_verified.257.472.2 -webarena_verified.257.474.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,474,train,webarena_verified.257.473.2 -webarena_verified.292.475.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,475,train,webarena_verified.331.452.2 -webarena_verified.292.476.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,476,train,webarena_verified.292.475.2 -webarena_verified.292.477.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,477,train,webarena_verified.292.476.2 -webarena_verified.292.478.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,478,test,webarena_verified.292.477.2 -webarena_verified.292.479.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,479,test,webarena_verified.292.478.2 -webarena_verified.293.480.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,480,train,webarena_verified.292.479.2 -webarena_verified.294.481.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,481,train,webarena_verified.293.480.2 -webarena_verified.294.482.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,482,train,webarena_verified.294.481.2 -webarena_verified.294.483.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,483,test,webarena_verified.294.482.2 -webarena_verified.294.484.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,484,train,webarena_verified.294.483.2 -webarena_verified.294.485.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,485,test,webarena_verified.294.484.2 -webarena_verified.275.486.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,486,train,webarena_verified.257.474.2 -webarena_verified.275.487.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,487,test,webarena_verified.275.486.2 -webarena_verified.275.488.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,488,test,webarena_verified.275.487.2 -webarena_verified.275.489.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,489,train,webarena_verified.275.488.2 
-webarena_verified.275.490.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,490,train,webarena_verified.275.489.2 -webarena_verified.280.491.2,False,shopping_admin,AgentResponseEvaluator,491,test,webarena_verified.275.490.2 -webarena_verified.280.492.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,492,train,webarena_verified.280.491.2 -webarena_verified.280.493.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,493,train,webarena_verified.280.492.2 -webarena_verified.280.494.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,494,train,webarena_verified.280.493.2 -webarena_verified.280.495.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,495,test,webarena_verified.280.494.2 -webarena_verified.284.496.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,496,train,webarena_verified.280.495.2 -webarena_verified.284.497.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,497,test,webarena_verified.284.496.2 -webarena_verified.284.498.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,498,test,webarena_verified.284.497.2 -webarena_verified.284.499.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,499,train,webarena_verified.284.498.2 -webarena_verified.284.500.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,500,train,webarena_verified.284.499.2 -webarena_verified.287.501.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,501,train,webarena_verified.284.500.2 -webarena_verified.287.502.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,502,test,webarena_verified.287.501.2 -webarena_verified.287.503.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,503,train,webarena_verified.287.502.2 -webarena_verified.287.504.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,504,test,webarena_verified.287.503.2 
-webarena_verified.287.505.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,505,train,webarena_verified.287.504.2 -webarena_verified.172.506.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,506,train,webarena_verified.186.469.2 -webarena_verified.172.507.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,507,train,webarena_verified.172.506.2 -webarena_verified.172.508.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,508,test,webarena_verified.172.507.2 -webarena_verified.216.509.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,509,test,webarena_verified.172.508.2 -webarena_verified.216.510.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,510,test,webarena_verified.216.509.2 -webarena_verified.189.511.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,511,test,webarena_verified.216.510.2 -webarena_verified.189.512.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,512,train,webarena_verified.189.511.2 -webarena_verified.189.513.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,513,train,webarena_verified.189.512.2 -webarena_verified.189.514.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,514,test,webarena_verified.189.513.2 -webarena_verified.189.515.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,515,train,webarena_verified.189.514.2 -webarena_verified.196.516.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,516,train,webarena_verified.189.515.2 -webarena_verified.196.517.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,517,test,webarena_verified.196.516.2 
-webarena_verified.196.518.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,518,test,webarena_verified.196.517.2 -webarena_verified.196.519.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,519,test,webarena_verified.196.518.2 -webarena_verified.196.520.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,520,train,webarena_verified.196.519.2 -webarena_verified.199.521.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,521,test,webarena_verified.196.520.2 -webarena_verified.352.522.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,522,test,webarena_verified.294.485.2 -webarena_verified.354.523.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,523,train,webarena_verified.352.522.2 -webarena_verified.354.524.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,524,test,webarena_verified.354.523.2 -webarena_verified.354.525.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,525,train,webarena_verified.354.524.2 -webarena_verified.354.526.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,526,train,webarena_verified.354.525.2 -webarena_verified.354.527.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,527,test,webarena_verified.354.526.2 -webarena_verified.154.528.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,528,train,webarena_verified.199.521.2 -webarena_verified.154.529.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,529,test,webarena_verified.154.528.2 -webarena_verified.154.530.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator 
NetworkEventEvaluator,530,test,webarena_verified.154.529.2 -webarena_verified.154.531.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,531,train,webarena_verified.154.530.2 -webarena_verified.154.532.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,532,train,webarena_verified.154.531.2 -webarena_verified.330.533.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,533,test,webarena_verified.354.527.2 -webarena_verified.330.534.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,534,train,webarena_verified.330.533.2 -webarena_verified.330.535.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,535,test,webarena_verified.330.534.2 -webarena_verified.330.536.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,536,train,webarena_verified.330.535.2 -webarena_verified.330.537.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,537,train,webarena_verified.330.536.2 -webarena_verified.240.538.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,538,train,webarena_verified.287.505.2 -webarena_verified.240.539.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,539,train,webarena_verified.240.538.2 -webarena_verified.240.540.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,540,test,webarena_verified.240.539.2 -webarena_verified.240.541.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,541,test,webarena_verified.240.540.2 -webarena_verified.240.542.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,542,train,webarena_verified.240.541.2 -webarena_verified.251.543.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,543,test,webarena_verified.240.542.2 
-webarena_verified.251.544.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,544,test,webarena_verified.251.543.2 -webarena_verified.251.545.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,545,test,webarena_verified.251.544.2 -webarena_verified.251.546.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,546,train,webarena_verified.251.545.2 -webarena_verified.252.547.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,547,train,webarena_verified.251.546.2 -webarena_verified.252.548.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,548,train,webarena_verified.252.547.2 -webarena_verified.252.549.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,549,test,webarena_verified.252.548.2 -webarena_verified.252.550.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,550,train,webarena_verified.252.549.2 -webarena_verified.252.551.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,551,test,webarena_verified.252.550.2 -webarena_verified.84.552.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,552,test,webarena_verified.23.410.2 -webarena_verified.84.553.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,553,test,webarena_verified.84.552.2 -webarena_verified.84.554.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,554,test,webarena_verified.84.553.2 -webarena_verified.84.555.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,555,test,webarena_verified.84.554.2 -webarena_verified.87.556.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,556,train,webarena_verified.84.555.2 -webarena_verified.87.557.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,557,test,webarena_verified.87.556.3 
-webarena_verified.87.558.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,558,train,webarena_verified.87.557.3 -webarena_verified.87.559.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,559,train,webarena_verified.87.558.3 -webarena_verified.87.560.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,560,test,webarena_verified.87.559.3 -webarena_verified.87.561.3,False,gitlab wikipedia,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,561,test,webarena_verified.87.560.3 -webarena_verified.88.562.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,562,train,webarena_verified.84.555.2 -webarena_verified.88.563.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,563,train,webarena_verified.88.562.2 -webarena_verified.88.564.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,564,train,webarena_verified.88.563.2 -webarena_verified.88.565.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,565,test,webarena_verified.88.564.2 -webarena_verified.88.566.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,566,test,webarena_verified.88.565.2 -webarena_verified.293.567.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,567,test,webarena_verified.88.566.2 -webarena_verified.293.568.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,568,train,webarena_verified.293.567.2 -webarena_verified.293.569.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,569,train,webarena_verified.293.568.2 -webarena_verified.293.570.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator 
NetworkEventEvaluator,570,test,webarena_verified.293.569.2 -webarena_verified.165.571.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,571,test,webarena_verified.154.532.2 -webarena_verified.165.572.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,572,train,webarena_verified.165.571.2 -webarena_verified.165.573.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,573,train,webarena_verified.165.572.2 -webarena_verified.165.574.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,574,test,webarena_verified.165.573.2 -webarena_verified.165.575.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,575,train,webarena_verified.165.574.2 -webarena_verified.351.576.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,576,test,webarena_verified.293.570.2 -webarena_verified.351.577.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,577,train,webarena_verified.351.576.2 -webarena_verified.351.578.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,578,test,webarena_verified.351.577.2 -webarena_verified.351.579.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,579,train,webarena_verified.351.578.2 -webarena_verified.7.580.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,580,train,webarena_verified.88.566.2 -webarena_verified.7.581.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,581,train,webarena_verified.7.580.2 -webarena_verified.7.582.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,582,test,webarena_verified.7.581.2 -webarena_verified.7.583.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,583,test,webarena_verified.7.582.2 -webarena_verified.7.584.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,584,train,webarena_verified.7.583.2 -webarena_verified.194.585.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,585,train,webarena_verified.165.575.2 -webarena_verified.194.586.2,False,shopping,AgentResponseEvaluator 
NetworkEventEvaluator,586,test,webarena_verified.194.585.2 -webarena_verified.194.587.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,587,train,webarena_verified.194.586.2 -webarena_verified.194.588.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,588,train,webarena_verified.194.587.2 -webarena_verified.194.589.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,589,test,webarena_verified.194.588.2 -webarena_verified.339.590.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,590,train,webarena_verified.351.579.2 -webarena_verified.339.591.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,591,test,webarena_verified.339.590.2 -webarena_verified.339.592.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,592,test,webarena_verified.339.591.2 -webarena_verified.339.593.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,593,test,webarena_verified.339.592.2 -webarena_verified.339.594.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,594,train,webarena_verified.339.593.2 -webarena_verified.4.595.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,595,train,webarena_verified.7.584.2 -webarena_verified.4.596.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,596,test,webarena_verified.4.595.2 -webarena_verified.4.597.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,597,train,webarena_verified.4.596.2 -webarena_verified.4.598.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,598,train,webarena_verified.4.597.2 -webarena_verified.4.599.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,599,test,webarena_verified.4.598.2 -webarena_verified.3765.600.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,600,test,webarena_verified.4.599.2 -webarena_verified.3765.601.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,601,train,webarena_verified.3765.600.2 -webarena_verified.3765.602.2,False,reddit,AgentResponseEvaluator 
NetworkEventEvaluator,602,train,webarena_verified.3765.601.2 -webarena_verified.3765.603.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,603,train,webarena_verified.3765.602.2 -webarena_verified.3765.604.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,604,test,webarena_verified.3765.603.2 -webarena_verified.5.605.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,605,train,webarena_verified.3765.604.2 -webarena_verified.5.606.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,606,train,webarena_verified.5.605.2 -webarena_verified.5.607.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,607,test,webarena_verified.5.606.2 -webarena_verified.5.608.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,608,test,webarena_verified.5.607.2 -webarena_verified.5.609.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,609,train,webarena_verified.5.608.2 -webarena_verified.9.610.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,610,train,webarena_verified.5.609.2 -webarena_verified.9.611.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,611,train,webarena_verified.9.610.2 -webarena_verified.9.612.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,612,test,webarena_verified.9.611.2 -webarena_verified.9.613.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,613,train,webarena_verified.9.612.2 -webarena_verified.9.614.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,614,test,webarena_verified.9.613.2 -webarena_verified.11.615.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,615,test,webarena_verified.9.614.2 -webarena_verified.11.616.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,616,test,webarena_verified.11.615.2 -webarena_verified.11.617.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,617,train,webarena_verified.11.616.2 
-webarena_verified.11.618.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,618,train,webarena_verified.11.617.2 -webarena_verified.11.619.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,619,train,webarena_verified.11.618.2 -webarena_verified.12.620.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,620,train,webarena_verified.11.619.2 -webarena_verified.12.621.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,621,train,webarena_verified.12.620.2 -webarena_verified.12.622.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,622,train,webarena_verified.12.621.2 -webarena_verified.12.623.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,623,test,webarena_verified.12.622.2 -webarena_verified.12.624.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,624,test,webarena_verified.12.623.2 -webarena_verified.13.625.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,625,train,webarena_verified.12.624.2 -webarena_verified.13.626.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,626,train,webarena_verified.13.625.2 -webarena_verified.13.627.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,627,train,webarena_verified.13.626.2 -webarena_verified.13.628.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,628,test,webarena_verified.13.627.2 -webarena_verified.13.629.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,629,test,webarena_verified.13.628.2 -webarena_verified.15.630.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,630,test,webarena_verified.13.629.2 -webarena_verified.15.631.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,631,train,webarena_verified.15.630.2 -webarena_verified.15.632.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,632,train,webarena_verified.15.631.2 -webarena_verified.15.633.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,633,test,webarena_verified.15.632.2 
-webarena_verified.15.634.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,634,train,webarena_verified.15.633.2 -webarena_verified.6100.635.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,635,train,webarena_verified.15.634.2 -webarena_verified.6100.636.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,636,train,webarena_verified.6100.635.2 -webarena_verified.6100.637.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,637,train,webarena_verified.6100.636.2 -webarena_verified.6100.638.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,638,test,webarena_verified.6100.637.2 -webarena_verified.6100.639.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,639,test,webarena_verified.6100.638.2 -webarena_verified.16.640.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,640,train,webarena_verified.6100.639.2 -webarena_verified.16.641.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,641,test,webarena_verified.16.640.2 -webarena_verified.16.642.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,642,test,webarena_verified.16.641.2 -webarena_verified.16.643.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,643,train,webarena_verified.16.642.2 -webarena_verified.16.644.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,644,train,webarena_verified.16.643.2 -webarena_verified.19.645.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,645,train,webarena_verified.16.644.2 -webarena_verified.19.646.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,646,train,webarena_verified.19.645.2 -webarena_verified.19.647.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,647,train,webarena_verified.19.646.2 -webarena_verified.19.648.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,648,test,webarena_verified.19.647.2 -webarena_verified.19.649.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,649,test,webarena_verified.19.648.2 
-webarena_verified.23.650.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,650,train,webarena_verified.19.649.2 -webarena_verified.23.651.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,651,train,webarena_verified.23.650.2 -webarena_verified.23.652.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,652,train,webarena_verified.23.651.2 -webarena_verified.153.653.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,653,train,webarena_verified.194.589.2 -webarena_verified.153.654.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,654,test,webarena_verified.153.653.2 -webarena_verified.153.655.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,655,test,webarena_verified.153.654.2 -webarena_verified.153.656.3,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,656,train,webarena_verified.153.655.2 -webarena_verified.153.657.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,657,train,webarena_verified.153.656.3 -webarena_verified.327.658.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,658,train,webarena_verified.339.594.2 -webarena_verified.327.659.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,659,test,webarena_verified.327.658.2 -webarena_verified.327.660.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,660,test,webarena_verified.327.659.2 -webarena_verified.328.661.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,661,test,webarena_verified.327.660.2 -webarena_verified.328.662.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,662,train,webarena_verified.328.661.2 -webarena_verified.328.663.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,663,train,webarena_verified.328.662.2 -webarena_verified.328.664.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,664,test,webarena_verified.328.663.2 -webarena_verified.328.665.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator,665,train,webarena_verified.328.664.2 -webarena_verified.335.666.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,666,test,webarena_verified.328.665.2 -webarena_verified.335.667.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,667,test,webarena_verified.335.666.2 -webarena_verified.335.668.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,668,test,webarena_verified.335.667.2 -webarena_verified.337.669.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,669,test,webarena_verified.335.668.2 -webarena_verified.337.670.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,670,train,webarena_verified.337.669.2 -webarena_verified.101.671.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,671,train,webarena_verified.23.652.2 -webarena_verified.101.672.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,672,train,webarena_verified.101.671.2 -webarena_verified.101.673.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,673,test,webarena_verified.101.672.2 -webarena_verified.101.674.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,674,test,webarena_verified.101.673.2 -webarena_verified.101.675.2,False,shopping reddit,AgentResponseEvaluator NetworkEventEvaluator,675,train,webarena_verified.101.674.2 -webarena_verified.253.676.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,676,test,webarena_verified.252.551.2 -webarena_verified.253.677.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,677,test,webarena_verified.253.676.2 -webarena_verified.253.678.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,678,train,webarena_verified.253.677.2 -webarena_verified.253.679.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,679,train,webarena_verified.253.678.2 -webarena_verified.253.680.2,False,shopping_admin,AgentResponseEvaluator 
NetworkEventEvaluator,680,train,webarena_verified.253.679.2 -webarena_verified.116.681.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,681,train,webarena_verified.337.670.2 -webarena_verified.116.682.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,682,train,webarena_verified.116.681.2 -webarena_verified.116.683.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,683,test,webarena_verified.116.682.2 -webarena_verified.117.684.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,684,train,webarena_verified.116.683.2 -webarena_verified.117.685.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,685,train,webarena_verified.117.684.2 -webarena_verified.117.686.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,686,train,webarena_verified.117.685.2 -webarena_verified.117.687.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,687,test,webarena_verified.117.686.2 -webarena_verified.117.688.2,False,reddit gitlab,AgentResponseEvaluator NetworkEventEvaluator,688,test,webarena_verified.117.687.2 -webarena_verified.163.689.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,689,test,webarena_verified.101.675.2 -webarena_verified.163.690.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,690,test,webarena_verified.163.689.2 -webarena_verified.163.691.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,691,train,webarena_verified.163.690.2 -webarena_verified.163.692.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,692,train,webarena_verified.163.691.2 -webarena_verified.163.693.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,693,train,webarena_verified.163.692.2 -webarena_verified.256.694.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,694,train,webarena_verified.253.680.2 
-webarena_verified.256.695.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,695,train,webarena_verified.256.694.2 -webarena_verified.256.696.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,696,test,webarena_verified.256.695.2 -webarena_verified.256.697.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,697,train,webarena_verified.256.696.2 -webarena_verified.256.698.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,698,test,webarena_verified.256.697.2 -webarena_verified.258.699.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,699,train,webarena_verified.256.698.2 -webarena_verified.258.700.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,700,test,webarena_verified.258.699.2 -webarena_verified.258.701.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,701,test,webarena_verified.258.700.2 -webarena_verified.258.702.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,702,train,webarena_verified.258.701.2 -webarena_verified.258.703.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,703,train,webarena_verified.258.702.2 -webarena_verified.268.704.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,704,test,webarena_verified.258.703.2 -webarena_verified.268.705.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,705,test,webarena_verified.268.704.2 -webarena_verified.268.706.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,706,train,webarena_verified.268.705.2 -webarena_verified.268.707.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,707,train,webarena_verified.268.706.2 -webarena_verified.268.708.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,708,train,webarena_verified.268.707.2 -webarena_verified.271.709.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,709,test,webarena_verified.268.708.2 
-webarena_verified.271.710.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,710,test,webarena_verified.271.709.2 -webarena_verified.271.711.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,711,train,webarena_verified.271.710.2 -webarena_verified.271.712.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,712,train,webarena_verified.271.711.2 -webarena_verified.271.713.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,713,train,webarena_verified.271.712.2 -webarena_verified.24.714.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,714,train,webarena_verified.117.688.2 -webarena_verified.24.715.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,715,train,webarena_verified.24.714.2 -webarena_verified.24.716.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,716,train,webarena_verified.24.715.2 -webarena_verified.24.717.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,717,test,webarena_verified.24.716.2 -webarena_verified.24.718.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,718,test,webarena_verified.24.717.2 -webarena_verified.25.719.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,719,train,webarena_verified.24.718.2 -webarena_verified.25.720.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,720,test,webarena_verified.25.719.2 -webarena_verified.25.721.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,721,train,webarena_verified.25.720.2 
-webarena_verified.25.722.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,722,train,webarena_verified.25.721.2 -webarena_verified.25.723.2,False,reddit,AgentResponseEvaluator,723,test,webarena_verified.25.722.2 -webarena_verified.25.724.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,724,test,webarena_verified.25.723.2 -webarena_verified.1510.725.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,725,test,webarena_verified.25.724.2 -webarena_verified.1510.726.2,False,reddit,AgentResponseEvaluator,726,test,webarena_verified.1510.725.2 -webarena_verified.1510.727.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,727,train,webarena_verified.1510.726.2 -webarena_verified.1510.728.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,728,train,webarena_verified.1510.727.2 -webarena_verified.1510.729.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,729,train,webarena_verified.1510.728.2 -webarena_verified.1510.730.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,730,test,webarena_verified.1510.729.2 -webarena_verified.27.731.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,731,test,webarena_verified.1510.730.2 -webarena_verified.27.732.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,732,train,webarena_verified.27.731.2 -webarena_verified.27.733.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,733,train,webarena_verified.27.732.2 -webarena_verified.27.734.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,734,train,webarena_verified.27.733.2 
-webarena_verified.27.735.2,False,reddit,AgentResponseEvaluator NetworkEventEvaluator,735,test,webarena_verified.27.734.2 -webarena_verified.355.736.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,736,train,webarena_verified.117.688.2 -webarena_verified.94.737.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,737,train,webarena_verified.371.430.2 -webarena_verified.94.738.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,738,test,webarena_verified.94.737.2 -webarena_verified.94.739.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,739,train,webarena_verified.94.738.2 -webarena_verified.94.740.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,740,test,webarena_verified.94.739.2 -webarena_verified.94.741.2,False,wikipedia map,AgentResponseEvaluator NetworkEventEvaluator,741,train,webarena_verified.94.740.2 -webarena_verified.332.742.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,742,test,webarena_verified.355.736.2 -webarena_verified.332.743.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,743,test,webarena_verified.332.742.2 -webarena_verified.332.744.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,744,test,webarena_verified.332.743.2 -webarena_verified.332.745.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,745,test,webarena_verified.332.744.2 -webarena_verified.332.746.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,746,train,webarena_verified.332.745.2 -webarena_verified.2100.747.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,747,train,webarena_verified.332.746.2 -webarena_verified.2100.748.2,False,gitlab,AgentResponseEvaluator 
NetworkEventEvaluator NetworkEventEvaluator,748,train,webarena_verified.2100.747.2 -webarena_verified.2100.749.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,749,test,webarena_verified.2100.748.2 -webarena_verified.2100.750.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,750,test,webarena_verified.2100.749.2 -webarena_verified.2100.751.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,751,train,webarena_verified.2100.750.2 -webarena_verified.332.752.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,752,train,webarena_verified.2100.751.2 -webarena_verified.332.753.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,753,test,webarena_verified.332.752.2 -webarena_verified.332.754.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,754,train,webarena_verified.332.753.2 -webarena_verified.332.755.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,755,test,webarena_verified.332.754.2 -webarena_verified.332.756.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,756,train,webarena_verified.332.755.2 -webarena_verified.42.757.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,757,test,webarena_verified.94.741.2 -webarena_verified.42.758.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,758,test,webarena_verified.42.757.2 -webarena_verified.42.759.2,False,map shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,759,test,webarena_verified.271.713.2 -webarena_verified.42.760.2,False,map shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,760,test,webarena_verified.42.759.2 -webarena_verified.54.761.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,761,train,webarena_verified.42.760.2 -webarena_verified.54.762.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,762,train,webarena_verified.54.761.2 -webarena_verified.75.763.2,False,map,AgentResponseEvaluator 
NetworkEventEvaluator,763,test,webarena_verified.54.762.2 -webarena_verified.75.764.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,764,test,webarena_verified.75.763.2 -webarena_verified.75.765.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,765,train,webarena_verified.75.764.2 -webarena_verified.75.766.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,766,train,webarena_verified.75.765.2 -webarena_verified.75.767.2,False,map,AgentResponseEvaluator NetworkEventEvaluator,767,train,webarena_verified.75.766.2 -webarena_verified.241.768.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,768,test,webarena_verified.42.760.2 -webarena_verified.241.769.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,769,test,webarena_verified.241.768.2 -webarena_verified.241.770.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,770,train,webarena_verified.241.769.2 -webarena_verified.243.771.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,771,test,webarena_verified.241.770.2 -webarena_verified.246.772.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,772,test,webarena_verified.243.771.2 -webarena_verified.246.773.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,773,train,webarena_verified.246.772.2 -webarena_verified.246.774.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,774,train,webarena_verified.246.773.2 -webarena_verified.246.775.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,775,train,webarena_verified.246.774.2 -webarena_verified.246.776.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,776,test,webarena_verified.246.775.2 
-webarena_verified.742.777.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,777,train,webarena_verified.246.776.2 -webarena_verified.742.778.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,778,test,webarena_verified.742.777.2 -webarena_verified.742.779.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,779,train,webarena_verified.742.778.2 -webarena_verified.742.780.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,780,test,webarena_verified.742.779.2 -webarena_verified.742.781.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator,781,train,webarena_verified.742.780.2 -webarena_verified.742.782.2,False,shopping_admin,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator NetworkEventEvaluator,782,test,webarena_verified.742.781.2 -webarena_verified.351.783.2,False,gitlab,AgentResponseEvaluator,783,train,webarena_verified.332.756.2 -webarena_verified.316.784.2,False,gitlab,AgentResponseEvaluator,784,test,webarena_verified.351.783.2 -webarena_verified.316.785.2,False,gitlab,AgentResponseEvaluator,785,test,webarena_verified.316.784.2 -webarena_verified.316.786.2,False,gitlab,AgentResponseEvaluator,786,test,webarena_verified.316.785.2 -webarena_verified.316.787.2,False,gitlab,AgentResponseEvaluator,787,test,webarena_verified.316.786.2 -webarena_verified.316.788.4,False,gitlab,AgentResponseEvaluator,788,test,webarena_verified.316.787.2 -webarena_verified.328.789.2,False,gitlab,AgentResponseEvaluator,789,test,webarena_verified.316.788.4 -webarena_verified.246.790.2,False,shopping_admin,AgentResponseEvaluator,790,test,webarena_verified.742.782.2 -webarena_verified.84.791.2,False,gitlab reddit,AgentResponseEvaluator NetworkEventEvaluator,791,train,webarena_verified.27.735.2 
-webarena_verified.172.792.2,False,shopping,AgentResponseEvaluator,792,test,webarena_verified.163.693.2 -webarena_verified.172.793.2,False,shopping,AgentResponseEvaluator,793,train,webarena_verified.172.792.2 -webarena_verified.191.794.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,794,test,webarena_verified.172.793.2 -webarena_verified.191.795.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,795,train,webarena_verified.191.794.2 -webarena_verified.191.796.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,796,train,webarena_verified.191.795.2 -webarena_verified.191.797.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,797,test,webarena_verified.191.796.2 -webarena_verified.191.798.2,False,shopping,AgentResponseEvaluator NetworkEventEvaluator,798,train,webarena_verified.191.797.2 -webarena_verified.600.799.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,799,train,webarena_verified.84.791.2 -webarena_verified.600.800.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,800,test,webarena_verified.600.799.2 -webarena_verified.600.801.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,801,train,webarena_verified.600.800.2 -webarena_verified.600.802.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,802,train,webarena_verified.600.801.2 -webarena_verified.600.803.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,803,test,webarena_verified.600.802.2 -webarena_verified.999.804.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator NetworkEventEvaluator,804,train,webarena_verified.600.803.2 -webarena_verified.335.805.2,False,gitlab,AgentResponseEvaluator,805,test,webarena_verified.999.804.2 -webarena_verified.335.806.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,806,test,webarena_verified.335.805.2 
-webarena_verified.335.807.2,False,gitlab,AgentResponseEvaluator,807,train,webarena_verified.335.806.2 -webarena_verified.327.808.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,808,train,webarena_verified.335.807.2 -webarena_verified.327.809.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,809,train,webarena_verified.327.808.2 -webarena_verified.999.810.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,810,test,webarena_verified.327.809.2 -webarena_verified.999.811.2,False,gitlab,AgentResponseEvaluator NetworkEventEvaluator,811,test,webarena_verified.999.810.2 From 29ce81b4723ccadfb6fc685d7de2eba90b933925 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 16:49:39 +0000 Subject: [PATCH 54/64] do not hardcode revision number --- .../browsergym/experiments/benchmark/utils.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index a5b5d2ee..b030e9bc 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -1,3 +1,5 @@ +import importlib.resources +import json import logging import multiprocessing as mp import os @@ -104,6 +106,27 @@ def make_env_args_list_from_fixed_seeds( return env_args_list +def get_webarena_verified_task_name(intent_template_id: int, task_id: int) -> str: + """ + Returns the task name (with revision) for a given intent template id and task id. 
+ """ + # Load the json file from the webarena-verified library + data = json.loads( + importlib.resources.files("webarena_verified") + .joinpath("assets/dataset/webarena-verified.json") + .read_text() + ) + for task in data: + if task["intent_template_id"] == intent_template_id and task["task_id"] == task_id: + revision = task["revision"] + break + else: + raise ValueError( + f"No task found for intent template id {intent_template_id} and task id {task_id} in webarena-verified.json" + ) + return f"webarena_verified.{intent_template_id}.{task_id}.{revision}" + + def prepare_backend(backend: str): match backend: case "miniwob": @@ -157,16 +180,16 @@ def prepare_backend(backend: str): ) massage_tasks( [ - f"webarena_verified.{intent_template_id}.{task_id}.{revision}" - for intent_template_id, task_id, revision in [ - (23, 410, 2), # reddit - (330, 533, 2), # gitlab - (87, 561, 3), # gitlab wiki - (88, 562, 2), # gitlab reddit - (165, 574, 2), # shopping - (16, 640, 2), # reddit - (253, 680, 2), # shopping_admin - (94, 740, 2), # wiki map + get_webarena_verified_task_name(intent_template_id, task_id) + for intent_template_id, task_id in [ + (23, 410), # reddit + # (330, 533), # gitlab + # (87, 561), # gitlab wiki + # (88, 562), # gitlab reddit + (165, 574), # shopping + (16, 640), # reddit + (253, 680), # shopping_admin + # (94, 740), # wiki map ] ] ) From 4b73bb4f1846c88c69a4e8e684bef3639cd04ee3 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 16:51:08 +0000 Subject: [PATCH 55/64] fix --- .../src/browsergym/experiments/benchmark/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py index b030e9bc..bd8915d1 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/utils.py @@ -183,13 +183,13 
@@ def prepare_backend(backend: str): get_webarena_verified_task_name(intent_template_id, task_id) for intent_template_id, task_id in [ (23, 410), # reddit - # (330, 533), # gitlab - # (87, 561), # gitlab wiki - # (88, 562), # gitlab reddit + (330, 533), # gitlab + (87, 561), # gitlab wiki + (88, 562), # gitlab reddit (165, 574), # shopping (16, 640), # reddit (253, 680), # shopping_admin - # (94, 740), # wiki map + (94, 740), # wiki map ] ] ) From ed6d6683aed1ef4849f20f9c0025f93b9b75a8f5 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 17:46:42 +0000 Subject: [PATCH 56/64] run black formater --- .../evaluation/evaluate_utils/evaluate_strings.py | 2 +- .../src/browsergym/experiments/benchmark/configs.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py b/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py index 68b451e3..e3f4dc8d 100644 --- a/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py +++ b/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py @@ -69,7 +69,7 @@ def _normalize_number(text: str) -> str: def _answer_to_bags( - answer: Union[str, List[str], Tuple[str, ...]] + answer: Union[str, List[str], Tuple[str, ...]], ) -> Tuple[List[str], List[Set[str]]]: if isinstance(answer, (list, tuple)): raw_spans = answer diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py index ede9ca08..4728a573 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/configs.py @@ -140,9 +140,7 @@ supports_parallel_seeds=False, backends=["webarena_verified"], 
env_args_list=make_env_args_list_from_repeat_tasks( - task_list=task_list_from_metadata( - metadata=task_metadata("webarena_verified") - ), + task_list=task_list_from_metadata(metadata=task_metadata("webarena_verified")), max_steps=30, n_repeats=n_repeats, seeds_rng=np.random.RandomState(42), From 89b646085e3b71dc4d361e36c2b61e3be2059a63 Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 18:38:42 +0000 Subject: [PATCH 57/64] fix format? --- .../src/browsergym/experiments/benchmark/metadata/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py index 697f5911..87d3e47d 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py @@ -67,9 +67,10 @@ def make_webarena_verified_metadata_if_not_exists(): original_task = original_tasks.get(task_id, {}) # Assert that new task sites matches the original task sites - assert sites_str == original_task.get( - "sites", "" - ), f"Task {task_id}: sites mismatch - JSON: {sites_str}, CSV: {original_task.get("sites", "")}" + original_sites_str = original_task.get("sites", "") + assert ( + sites_str == original_sites_str + ), f"Task {task_id}: sites mismatch - JSON: {sites_str}, CSV: {original_sites_str}" # Construct the dependency task name if original_dependency := original_task.get("depends_on"): From 333d368b7f357f879872120675f08ede28037ddd Mon Sep 17 00:00:00 2001 From: Nicolas Gontier Date: Mon, 15 Dec 2025 20:08:39 +0000 Subject: [PATCH 58/64] always create the metadata file --- .../browsergym/experiments/benchmark/metadata/utils.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py 
b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py index 87d3e47d..f1fccd6f 100644 --- a/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py +++ b/browsergym/experiments/src/browsergym/experiments/benchmark/metadata/utils.py @@ -13,13 +13,10 @@ from browsergym.experiments.loop import EnvArgs -def make_webarena_verified_metadata_if_not_exists(): +def make_webarena_verified_metadata(): """ - Checks if the webarena_verified.csv file exists. If not, it creates it. + Creates the webarena_verified.csv metadata file based on the original webarena.csv file and the webarena-verified.json file in the webarena-verified library. """ - if os.path.exists(os.path.join(os.path.dirname(__file__), "webarena_verified.csv")): - return - # Load the json file from the webarena-verified library data = json.loads( importlib.resources.files("webarena_verified") @@ -118,7 +115,7 @@ def make_webarena_verified_metadata_if_not_exists(): def task_metadata(benchmark_name: str): if benchmark_name == "webarena_verified": - make_webarena_verified_metadata_if_not_exists() + make_webarena_verified_metadata() return task_metadata_from_csv( io.StringIO(pkgutil.get_data(__name__, f"{benchmark_name}.csv").decode("utf-8")) From 6535641860165beccdf1b0d69f4ac693d8fccca7 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Mon, 15 Dec 2025 15:39:06 -0500 Subject: [PATCH 59/64] version-bump-dev --- browsergym/assistantbench/requirements.txt | 2 +- browsergym/core/src/browsergym/core/__init__.py | 2 +- browsergym/experiments/requirements.txt | 2 +- browsergym/miniwob/requirements.txt | 2 +- browsergym/pyproject.toml | 17 +++++++++-------- browsergym/visualwebarena/requirements.txt | 2 +- browsergym/webarena/requirements.txt | 2 +- browsergym/webarena_verified/requirements.txt | 2 +- browsergym/webarenalite/requirements.txt | 4 ++-- docs/src/conf.py | 2 +- 10 files changed, 19 insertions(+), 18 deletions(-) diff --git 
a/browsergym/assistantbench/requirements.txt b/browsergym/assistantbench/requirements.txt index b25a036e..08f228d4 100644 --- a/browsergym/assistantbench/requirements.txt +++ b/browsergym/assistantbench/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 datasets scipy numpy diff --git a/browsergym/core/src/browsergym/core/__init__.py b/browsergym/core/src/browsergym/core/__init__.py index 945dd106..9b712dc8 100644 --- a/browsergym/core/src/browsergym/core/__init__.py +++ b/browsergym/core/src/browsergym/core/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.14.3.dev1" +__version__ = "0.14.3.dev2" import playwright.sync_api diff --git a/browsergym/experiments/requirements.txt b/browsergym/experiments/requirements.txt index c0cbb7c7..b827c9b3 100644 --- a/browsergym/experiments/requirements.txt +++ b/browsergym/experiments/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 tiktoken>=0.4 dataclasses-json diff --git a/browsergym/miniwob/requirements.txt b/browsergym/miniwob/requirements.txt index 885b7518..8bc0306e 100644 --- a/browsergym/miniwob/requirements.txt +++ b/browsergym/miniwob/requirements.txt @@ -1 +1 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 diff --git a/browsergym/pyproject.toml b/browsergym/pyproject.toml index 1988e430..12976fdb 100644 --- a/browsergym/pyproject.toml +++ b/browsergym/pyproject.toml @@ -29,17 +29,18 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "License :: OSI Approved :: Apache Software License", ] -version="0.14.3.dev1" +version="0.14.3.dev2" dependencies = [ - "browsergym-core==0.14.3.dev1", - "browsergym-miniwob==0.14.3.dev1", - "browsergym-webarena==0.14.3.dev1", - "browsergym-visualwebarena==0.14.3.dev1", - "browsergym-assistantbench==0.14.3.dev1", - "browsergym-experiments==0.14.3.dev1", + "browsergym-core==0.14.3.dev2", + "browsergym-miniwob==0.14.3.dev2", + "browsergym-webarena==0.14.3.dev2", + 
"browsergym-visualwebarena==0.14.3.dev2", + "browsergym-assistantbench==0.14.3.dev2", + "browsergym-experiments==0.14.3.dev2", "browsergym-workarena>=0.4.1", "weblinx-browsergym>=0.0.2", - "browsergym-webarenalite==0.14.3.dev1" + "browsergym-webarenalite==0.14.3.dev2", + "browsergym-webarena-verified==0.14.3.dev2" ] [tool.setuptools] diff --git a/browsergym/visualwebarena/requirements.txt b/browsergym/visualwebarena/requirements.txt index 376e6b41..35669958 100644 --- a/browsergym/visualwebarena/requirements.txt +++ b/browsergym/visualwebarena/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 browsergym-webarena libvisualwebarena==0.0.15 requests diff --git a/browsergym/webarena/requirements.txt b/browsergym/webarena/requirements.txt index 3b5c8171..32e7278f 100644 --- a/browsergym/webarena/requirements.txt +++ b/browsergym/webarena/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 libwebarena==0.0.4 diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 46075a76..906a24b9 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.14.3.dev1 +browsergym-core==0.14.3.dev2 webarena-verified @ git+https://github.com/ServiceNow/webarena-verified \ No newline at end of file diff --git a/browsergym/webarenalite/requirements.txt b/browsergym/webarenalite/requirements.txt index b918175f..e814bf07 100644 --- a/browsergym/webarenalite/requirements.txt +++ b/browsergym/webarenalite/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev1 -browsergym-webarena==0.14.3.dev1 +browsergym-core==0.14.3.dev2 +browsergym-webarena==0.14.3.dev2 libwebarena==0.0.4 diff --git a/docs/src/conf.py b/docs/src/conf.py index 594d0259..80c4b1b0 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -6,7 +6,7 @@ copyright = "2024, ServiceNow Research" author = 
"ServiceNow Research" -version = "0.14.3.dev1" +version = "0.14.3.dev2" release = version # -- General configuration From 84c1246824fb80547b7bd920207068644865fa41 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Mon, 15 Dec 2025 16:19:18 -0500 Subject: [PATCH 60/64] Remove git dependency and add ins to install from source --- browsergym/webarena_verified/README.md | 14 +++++++------- browsergym/webarena_verified/requirements.txt | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/browsergym/webarena_verified/README.md b/browsergym/webarena_verified/README.md index 44fe4d5e..6722d998 100644 --- a/browsergym/webarena_verified/README.md +++ b/browsergym/webarena_verified/README.md @@ -10,13 +10,13 @@ Follow the official [webarena README](https://github.com/web-arena-x/webarena/bl #### 1. Install webarena-verified -```bash -make install -``` -Alternatively, you can also run: -```bash -pip install -e ./browsergym/webarena_verified -``` +**IMPORTANT!** webarena-verified is still in active development. You need to install the webarena-verified package from source. + +> `pip install git+https://github.com/ServiceNow/webarena-verified` + +then install + +`pip install browsergym-webarena-verified` #### 2. 
Setup WebArena environment URLs diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 906a24b9..8bc0306e 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1,2 +1 @@ browsergym-core==0.14.3.dev2 -webarena-verified @ git+https://github.com/ServiceNow/webarena-verified \ No newline at end of file From ddeb2e7f48a08e13f8e455e72c27dd1373255225 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Mon, 15 Dec 2025 16:21:06 -0500 Subject: [PATCH 61/64] version-bump-dev 0.14.3.dev3 --- browsergym/assistantbench/requirements.txt | 2 +- .../core/src/browsergym/core/__init__.py | 2 +- browsergym/experiments/requirements.txt | 2 +- browsergym/miniwob/requirements.txt | 2 +- browsergym/pyproject.toml | 18 +++++++++--------- browsergym/visualwebarena/requirements.txt | 2 +- browsergym/webarena/requirements.txt | 2 +- browsergym/webarena_verified/requirements.txt | 2 +- browsergym/webarenalite/requirements.txt | 4 ++-- docs/src/conf.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/browsergym/assistantbench/requirements.txt b/browsergym/assistantbench/requirements.txt index 08f228d4..aa7a96c1 100644 --- a/browsergym/assistantbench/requirements.txt +++ b/browsergym/assistantbench/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 datasets scipy numpy diff --git a/browsergym/core/src/browsergym/core/__init__.py b/browsergym/core/src/browsergym/core/__init__.py index 9b712dc8..3007788d 100644 --- a/browsergym/core/src/browsergym/core/__init__.py +++ b/browsergym/core/src/browsergym/core/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.14.3.dev2" +__version__ = "0.14.3.dev3" import playwright.sync_api diff --git a/browsergym/experiments/requirements.txt b/browsergym/experiments/requirements.txt index b827c9b3..b1f5af26 100644 --- a/browsergym/experiments/requirements.txt +++ 
b/browsergym/experiments/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 tiktoken>=0.4 dataclasses-json diff --git a/browsergym/miniwob/requirements.txt b/browsergym/miniwob/requirements.txt index 8bc0306e..4fbbdb9b 100644 --- a/browsergym/miniwob/requirements.txt +++ b/browsergym/miniwob/requirements.txt @@ -1 +1 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 diff --git a/browsergym/pyproject.toml b/browsergym/pyproject.toml index 12976fdb..de310035 100644 --- a/browsergym/pyproject.toml +++ b/browsergym/pyproject.toml @@ -29,18 +29,18 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "License :: OSI Approved :: Apache Software License", ] -version="0.14.3.dev2" +version="0.14.3.dev3" dependencies = [ - "browsergym-core==0.14.3.dev2", - "browsergym-miniwob==0.14.3.dev2", - "browsergym-webarena==0.14.3.dev2", - "browsergym-visualwebarena==0.14.3.dev2", - "browsergym-assistantbench==0.14.3.dev2", - "browsergym-experiments==0.14.3.dev2", + "browsergym-core==0.14.3.dev3", + "browsergym-miniwob==0.14.3.dev3", + "browsergym-webarena==0.14.3.dev3", + "browsergym-visualwebarena==0.14.3.dev3", + "browsergym-assistantbench==0.14.3.dev3", + "browsergym-experiments==0.14.3.dev3", "browsergym-workarena>=0.4.1", "weblinx-browsergym>=0.0.2", - "browsergym-webarenalite==0.14.3.dev2", - "browsergym-webarena-verified==0.14.3.dev2" + "browsergym-webarenalite==0.14.3.dev3", + "browsergym-webarena-verified==0.14.3.dev3" ] [tool.setuptools] diff --git a/browsergym/visualwebarena/requirements.txt b/browsergym/visualwebarena/requirements.txt index 35669958..a20b168a 100644 --- a/browsergym/visualwebarena/requirements.txt +++ b/browsergym/visualwebarena/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 browsergym-webarena libvisualwebarena==0.0.15 requests diff --git a/browsergym/webarena/requirements.txt b/browsergym/webarena/requirements.txt index 
32e7278f..2f7d4448 100644 --- a/browsergym/webarena/requirements.txt +++ b/browsergym/webarena/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 libwebarena==0.0.4 diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 8bc0306e..4fbbdb9b 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1 +1 @@ -browsergym-core==0.14.3.dev2 +browsergym-core==0.14.3.dev3 diff --git a/browsergym/webarenalite/requirements.txt b/browsergym/webarenalite/requirements.txt index e814bf07..f2389fe1 100644 --- a/browsergym/webarenalite/requirements.txt +++ b/browsergym/webarenalite/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev2 -browsergym-webarena==0.14.3.dev2 +browsergym-core==0.14.3.dev3 +browsergym-webarena==0.14.3.dev3 libwebarena==0.0.4 diff --git a/docs/src/conf.py b/docs/src/conf.py index 80c4b1b0..18173f7e 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -6,7 +6,7 @@ copyright = "2024, ServiceNow Research" author = "ServiceNow Research" -version = "0.14.3.dev2" +version = "0.14.3.dev3" release = version # -- General configuration From e61b022832bd64fb829a095d8564fcce6651be55 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 8 Jan 2026 15:46:41 -0500 Subject: [PATCH 62/64] add webarena-verified package as a dependency --- browsergym/webarena_verified/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index 4fbbdb9b..f0b5a1c9 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1 +1,2 @@ browsergym-core==0.14.3.dev3 +webarena-verified \ No newline at end of file From 731852c8c5488c55944fc54e16ccc64f8a7917dc Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 8 Jan 2026 16:07:45 -0500 Subject: [PATCH 63/64] version-bump-dev 
0.14.3.dev4 --- browsergym/assistantbench/requirements.txt | 2 +- .../core/src/browsergym/core/__init__.py | 2 +- browsergym/experiments/requirements.txt | 2 +- browsergym/miniwob/requirements.txt | 2 +- browsergym/pyproject.toml | 18 +++++++++--------- browsergym/visualwebarena/requirements.txt | 2 +- browsergym/webarena/requirements.txt | 2 +- browsergym/webarena_verified/requirements.txt | 2 +- browsergym/webarenalite/requirements.txt | 4 ++-- docs/src/conf.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/browsergym/assistantbench/requirements.txt b/browsergym/assistantbench/requirements.txt index aa7a96c1..a52e4bc5 100644 --- a/browsergym/assistantbench/requirements.txt +++ b/browsergym/assistantbench/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 datasets scipy numpy diff --git a/browsergym/core/src/browsergym/core/__init__.py b/browsergym/core/src/browsergym/core/__init__.py index 3007788d..9c60a0e4 100644 --- a/browsergym/core/src/browsergym/core/__init__.py +++ b/browsergym/core/src/browsergym/core/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.14.3.dev3" +__version__ = "0.14.3.dev4" import playwright.sync_api diff --git a/browsergym/experiments/requirements.txt b/browsergym/experiments/requirements.txt index b1f5af26..41ccde8c 100644 --- a/browsergym/experiments/requirements.txt +++ b/browsergym/experiments/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 tiktoken>=0.4 dataclasses-json diff --git a/browsergym/miniwob/requirements.txt b/browsergym/miniwob/requirements.txt index 4fbbdb9b..d703649d 100644 --- a/browsergym/miniwob/requirements.txt +++ b/browsergym/miniwob/requirements.txt @@ -1 +1 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 diff --git a/browsergym/pyproject.toml b/browsergym/pyproject.toml index de310035..a934692a 100644 --- a/browsergym/pyproject.toml +++ b/browsergym/pyproject.toml @@ -29,18 +29,18 @@ 
classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "License :: OSI Approved :: Apache Software License", ] -version="0.14.3.dev3" +version="0.14.3.dev4" dependencies = [ - "browsergym-core==0.14.3.dev3", - "browsergym-miniwob==0.14.3.dev3", - "browsergym-webarena==0.14.3.dev3", - "browsergym-visualwebarena==0.14.3.dev3", - "browsergym-assistantbench==0.14.3.dev3", - "browsergym-experiments==0.14.3.dev3", + "browsergym-core==0.14.3.dev4", + "browsergym-miniwob==0.14.3.dev4", + "browsergym-webarena==0.14.3.dev4", + "browsergym-visualwebarena==0.14.3.dev4", + "browsergym-assistantbench==0.14.3.dev4", + "browsergym-experiments==0.14.3.dev4", "browsergym-workarena>=0.4.1", "weblinx-browsergym>=0.0.2", - "browsergym-webarenalite==0.14.3.dev3", - "browsergym-webarena-verified==0.14.3.dev3" + "browsergym-webarenalite==0.14.3.dev4", + "browsergym-webarena-verified==0.14.3.dev4" ] [tool.setuptools] diff --git a/browsergym/visualwebarena/requirements.txt b/browsergym/visualwebarena/requirements.txt index a20b168a..34c4a321 100644 --- a/browsergym/visualwebarena/requirements.txt +++ b/browsergym/visualwebarena/requirements.txt @@ -1,4 +1,4 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 browsergym-webarena libvisualwebarena==0.0.15 requests diff --git a/browsergym/webarena/requirements.txt b/browsergym/webarena/requirements.txt index 2f7d4448..34c17e3f 100644 --- a/browsergym/webarena/requirements.txt +++ b/browsergym/webarena/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 libwebarena==0.0.4 diff --git a/browsergym/webarena_verified/requirements.txt b/browsergym/webarena_verified/requirements.txt index f0b5a1c9..07eb12be 100644 --- a/browsergym/webarena_verified/requirements.txt +++ b/browsergym/webarena_verified/requirements.txt @@ -1,2 +1,2 @@ -browsergym-core==0.14.3.dev3 +browsergym-core==0.14.3.dev4 webarena-verified \ No newline at end of file diff --git 
a/browsergym/webarenalite/requirements.txt b/browsergym/webarenalite/requirements.txt index f2389fe1..43031ee6 100644 --- a/browsergym/webarenalite/requirements.txt +++ b/browsergym/webarenalite/requirements.txt @@ -1,3 +1,3 @@ -browsergym-core==0.14.3.dev3 -browsergym-webarena==0.14.3.dev3 +browsergym-core==0.14.3.dev4 +browsergym-webarena==0.14.3.dev4 libwebarena==0.0.4 diff --git a/docs/src/conf.py b/docs/src/conf.py index 18173f7e..fad961c5 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -6,7 +6,7 @@ copyright = "2024, ServiceNow Research" author = "ServiceNow Research" -version = "0.14.3.dev3" +version = "0.14.3.dev4" release = version # -- General configuration From 4367cc708aded6b138c904700f55d8fb864a0dd3 Mon Sep 17 00:00:00 2001 From: Aman Jaiswal Date: Thu, 8 Jan 2026 16:43:25 -0500 Subject: [PATCH 64/64] add webarena-verified in the dev requirements.txt --- dev/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/dev/requirements.txt b/dev/requirements.txt index 4ea56ecf..46126fe9 100644 --- a/dev/requirements.txt +++ b/dev/requirements.txt @@ -12,5 +12,6 @@ tenacity -e ../browsergym/experiments # local package -e ../browsergym/assistantbench # local package -e ../browsergym/webarenalite # local package +-e ../browsergym/webarena_verified # local package browsergym-workarena weblinx_browsergym