import os
import jsonschema
import pandas as pd
from jsonschema import validate
from chinatravel.data.load_datasets import load_query
from chinatravel.evaluation.utils import load_json_file
from chinatravel.symbol_verification.commonsense_constraint import (
Is_time_correct,
Is_space_correct,
Is_hotels_correct,
Is_transport_correct,
Is_attractions_correct,
Is_restaurants_correct,
Is_intercity_transport_correct,
)
from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py
os.environ["HF_DATASETS_OFFLINE"] = "1"


def load_result(result_dir, query_index):
    """Load per-query plan JSON files; missing or unreadable files yield empty plans."""
    result = {}
    matched_uid = []
    unmatched_uid = []
    for query_id in query_index:
        result_file = os.path.join(result_dir, f"{query_id}.json")
        # print(f"Loading result for {query_id} from {result_file}")
        try:
            if os.path.exists(result_file):
                result[query_id] = load_json_file(result_file)
                matched_uid.append(query_id)
            else:
                result[query_id] = {}
                unmatched_uid.append(query_id)
        except Exception:
            result[query_id] = {}
            unmatched_uid.append(query_id)
    return result, matched_uid, unmatched_uid


def validate_json(json_data, schema):
    """Return True if json_data conforms to schema, False on any validation error."""
    try:
        validate(instance=json_data, schema=schema)
        return True
    except jsonschema.exceptions.ValidationError:
        return False
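

# A minimal usage sketch for validate_json; the schema below is illustrative,
# not the real output_schema.json shipped with the repo:
#
#     schema = {"type": "object", "required": ["itinerary"]}
#     validate_json({"itinerary": []}, schema)   # -> True
#     validate_json({"budget": 100}, schema)     # -> False (missing required key)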


def evaluate_schema_constraints(data_index, plan_json_dict, schema, result):
    """Check each plan against the output JSON schema, yielding progress events.

    Writes DR (delivery rate, % of schema-valid plans) and the list of passing
    IDs (S_pass_id) into result.
    """
    total_correct = 0
    result_agg = pd.DataFrame(columns=["data_id", "schema"])
    result_agg["data_id"] = data_index
    pass_id = []
    total = len(data_index)
    for ii, idx in enumerate(data_index):
        plan_json = plan_json_dict[idx]
        succ_flag = 0
        try:
            if validate_json(plan_json, schema):
                succ_flag = 1
                pass_id.append(idx)
        except Exception:
            pass
        result_agg.loc[ii, "schema"] = succ_flag
        total_correct += succ_flag
        yield {
            "stage": "schema",
            "progress": (ii + 1) / total * 100,
        }
    total_count = len(data_index)
    result["DR"] = total_correct / total_count * 100
    result["S_pass_id"] = pass_id
"""
Constraints:
Available
1. Intercity transport information exsits and is objective: ID, time, startpos and endpos need to be correct.
2. Attractions
3. Hotels
4. Restaurants
5. transportation
6. Times
7. space
"""


def evaluate_commonsense_constraints(
    data_index, symbolic_input_dict, plan_json_dict, result
):
    """Run all commonsense checkers per plan, yielding progress events.

    Writes EPR_micro / EPR_macro (environmental constraint pass rates, %) and
    the list of fully passing IDs (E_pass_id) into result.
    """
    func_list = [
        Is_intercity_transport_correct,
        Is_attractions_correct,
        Is_hotels_correct,
        Is_restaurants_correct,
        Is_transport_correct,
        Is_time_correct,
        Is_space_correct,
    ]
    result_agg = pd.DataFrame(columns=["data_id"])
    result_agg["data_id"] = data_index
    individual_succ = 0
    pass_id = []
    total = len(data_index)
    for ii, idx in enumerate(data_index):
        symbolic_input, plan_json = symbolic_input_dict[idx], plan_json_dict[idx]
        try:
            for func in func_list:
                table_res, _ = func(symbolic_input, plan_json, verbose=False)
                for colum_i in table_res.columns:
                    if colum_i not in result_agg.columns:
                        result_agg[colum_i] = 0
                    result_agg.loc[ii, colum_i] = table_res[colum_i].loc[0]
            # A plan passes if no checker reported any violation for this row.
            if result_agg.iloc[ii, 1:].sum() == 0:
                individual_succ += 1
                pass_id.append(idx)
        except Exception:
            pass
        yield {
            "stage": "commonsense",
            "progress": (ii + 1) / total * 100,
        }
    total_count = len(data_index)
    # Micro: fraction of individual checks passed across all plans;
    # macro: fraction of plans that pass every check.
    micro_accuracy = 1.0 - result_agg.drop("data_id", axis=1).sum().sum() / (
        total_count * (result_agg.shape[1] - 1)
    )
    macro_accuracy = individual_succ / total_count
    result["EPR_micro"] = micro_accuracy * 100
    result["EPR_macro"] = macro_accuracy * 100
    result["E_pass_id"] = pass_id


def evaluate_hard_constraints_v2(
    data_index, symbolic_input_dict, plan_json_dict, env_pass_id, result: dict
):
    """Evaluate per-query hard logic constraints, yielding progress events.

    Writes LPR_micro / LPR_macro (logical constraint pass rates, %), C-LPR
    (conditioned on passing the commonsense checks), and L_pass_id into result.
    """
    max_logic_num = 0
    for idx in data_index:
        max_logic_num = max(
            max_logic_num, len(symbolic_input_dict[idx]["hard_logic_py"])
        )
    columns = ["data_id"]
    for i in range(max_logic_num):
        columns.append(f"logic_py_{i}")
    result_agg = pd.DataFrame(columns=columns)
    for col_i in result_agg.columns[1:]:
        result_agg[col_i] = 0
    macro_count, macro_succ_count = 0, 0
    micro_count, micro_succ_count = 0, 0
    conditional_micro_succ_count, conditional_macro_succ_count = 0, 0
    results = []
    passed_id = []
    total = len(data_index)
    for ii, idx in enumerate(data_index):
        symbolic_input, plan_json = symbolic_input_dict[idx], plan_json_dict[idx]
        result_ii = evaluate_constraints_py(
            symbolic_input["hard_logic_py"], plan_json, verbose=False
        )
        results.append(result_ii)
        dict_ii = {}
        succ_c_sum = 0
        for logic_i in range(len(symbolic_input["hard_logic_py"])):
            dict_ii[f"logic_py_{logic_i}"] = int(result_ii[logic_i])
            succ_c_sum += int(result_ii[logic_i])
        macro_count += 1
        macro_succ_count += succ_c_sum == len(dict_ii)
        micro_count += len(dict_ii)
        micro_succ_count += succ_c_sum
        # Conditional counts only credit plans that already passed the
        # commonsense (environment) checks.
        if idx in env_pass_id:
            conditional_micro_succ_count += succ_c_sum
            conditional_macro_succ_count += succ_c_sum == len(dict_ii)
        if succ_c_sum == len(dict_ii):
            passed_id.append(idx)
        dict_ii["data_id"] = idx
        result_agg.loc[ii] = pd.Series(dict_ii)
        yield {
            "stage": "logic",
            "progress": (ii + 1) / total * 100,
        }
    macro = macro_succ_count / macro_count
    micro = micro_succ_count / micro_count
    c_macro = conditional_macro_succ_count / macro_count  # computed for reference; not reported below
    c_micro = conditional_micro_succ_count / micro_count
    result["LPR_micro"] = micro * 100
    result["LPR_macro"] = macro * 100
    result["C-LPR"] = c_micro * 100
    result["L_pass_id"] = passed_id


def evaluate(args, result):
    """Run all evaluation stages as a generator, filling result in place."""
    eval_result = {}
    query_index, query_data = load_query(args)
    result_data, matched_uid, unmatched_uid = load_result(
        args.result_dir, query_index=query_index
    )
    eval_result["matched_uid"] = matched_uid
    eval_result["unmatched_uid"] = unmatched_uid
    schema_file_path = "chinatravel/evaluation/output_schema.json"
    schema = load_json_file(schema_file_path)
    # schema pass rate
    yield from evaluate_schema_constraints(
        query_index, result_data, schema=schema, result=eval_result
    )
    # commonsense pass rate
    yield from evaluate_commonsense_constraints(
        query_index, query_data, result_data, result=eval_result
    )
    # hard logic pass rate
    yield from evaluate_hard_constraints_v2(
        query_index,
        query_data,
        result_data,
        env_pass_id=eval_result.get("E_pass_id", []),
        result=eval_result,
    )
    # all pass rate: FPR counts plans that pass every stage
    all_pass_id = set(query_index)  # initialize with all query IDs
    for key in eval_result:
        if "pass_id" in key:
            all_pass_id.intersection_update(set(eval_result[key]))
    eval_result["FPR"] = len(all_pass_id) / len(query_index) * 100
    # drop the intermediate pass-id lists before reporting
    del_keys = [key for key in eval_result if "pass_id" in key]
    for key in del_keys:
        del eval_result[key]
    # update the caller's dict in place (rebinding the parameter would discard the results)
    result.update(eval_result)
    yield {
        "stage": "final",
        "progress": 100,
        "result": result,
    }
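

# Minimal driver sketch. evaluate() is a generator, so a caller must drain it
# to completion; --result_dir matches the usage above, but any further argparse
# fields expected by load_query are assumptions and may differ in the repo.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--result_dir", type=str, required=True)
    parser.add_argument("--splits", type=str, default="easy")  # assumed field for load_query
    args = parser.parse_args()

    final_result = {}
    for event in evaluate(args, final_result):
        print(f"[{event['stage']}] {event['progress']:.1f}%")
    print(final_result)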