import os

import jsonschema
import pandas as pd
from jsonschema import validate

from chinatravel.data.load_datasets import load_query
from chinatravel.evaluation.utils import load_json_file
from chinatravel.symbol_verification.commonsense_constraint import (
    Is_time_correct,
    Is_space_correct,
    Is_hotels_correct,
    Is_transport_correct,
    Is_attractions_correct,
    Is_restaurants_correct,
    Is_intercity_transport_correct,
)
from chinatravel.symbol_verification.hard_constraint import evaluate_constraints_py

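# Keep the Hugging Face `datasets` library in offline mode so evaluation never
# touches the network.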
os.environ["HF_DATASETS_OFFLINE"] = "1"


def load_result(result_dir, query_index):
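    """Load one generated plan per query id from result_dir.

    Returns a dict mapping query id -> plan JSON (an empty dict when the
    file is missing or unreadable), plus the lists of matched and
    unmatched query ids.
    """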
    result = {}
    matched_uid = []
    unmatched_uid = []
    for query_id in query_index:
        result_file = os.path.join(result_dir, f"{query_id}.json")

        try:
            if os.path.exists(result_file):
                result[query_id] = load_json_file(result_file)
                matched_uid.append(query_id)
            else:
                result[query_id] = {}
                unmatched_uid.append(query_id)
        except Exception:
            result[query_id] = {}
            unmatched_uid.append(query_id)
    return result, matched_uid, unmatched_uid


def validate_json(json_data, schema):
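    """Return True if json_data conforms to schema, else False."""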
    try:
        validate(instance=json_data, schema=schema)
        return True
    except jsonschema.exceptions.ValidationError:
        return False


def evaluate_schema_constraints(data_index, plan_json_dict, schema, result):
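    """Validate every plan against the output JSON schema.

    Yields "schema"-stage progress events; stores the percentage of
    schema-valid plans under result["DR"] and their ids under
    result["S_pass_id"].
    """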
    total_correct = 0
    result_agg = pd.DataFrame(columns=["data_id", "schema"])
    result_agg["data_id"] = data_index

    pass_id = []

    total = len(data_index)
    for ii, idx in enumerate(data_index):
        plan_json = plan_json_dict[idx]
        succ_flag = 0
        try:
            if validate_json(plan_json, schema):
                succ_flag = 1
                pass_id.append(idx)
        except Exception:
            pass
        result_agg.loc[ii, "schema"] = succ_flag
        total_correct += succ_flag
        yield {
            "stage": "schema",
            "progress": (ii + 1) / total * 100,
        }

    result["DR"] = total_correct / total * 100
    result["S_pass_id"] = pass_id


"""
Commonsense constraints (available):

1. Intercity transport information exists and is objective: ID, time, start
   position and end position must be correct.
2. Attractions
3. Hotels
4. Restaurants
5. Transportation
6. Time
7. Space
"""


def evaluate_commonsense_constraints(
    data_index, symbolic_input_dict, plan_json_dict, result
):
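    """Run the commonsense (environmental) checkers on every plan.

    Each checker returns a one-row error table; a plan passes when all of
    its error flags are zero. Stores the micro/macro pass rates under
    result["EPR_micro"] / result["EPR_macro"] and the ids of fully passing
    plans under result["E_pass_id"], yielding progress events along the way.
    """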
    func_list = [
        Is_intercity_transport_correct,
        Is_attractions_correct,
        Is_hotels_correct,
        Is_restaurants_correct,
        Is_transport_correct,
        Is_time_correct,
        Is_space_correct,
    ]
    result_agg = pd.DataFrame(columns=["data_id"])
    result_agg["data_id"] = data_index
    individual_succ = 0
    pass_id = []
    total = len(data_index)
    for ii, idx in enumerate(data_index):
        symbolic_input, plan_json = symbolic_input_dict[idx], plan_json_dict[idx]
        try:
            for func in func_list:
                table_res, _ = func(symbolic_input, plan_json, verbose=False)
                for column_i in table_res.columns:
                    if column_i not in result_agg.columns:
                        result_agg[column_i] = 0
                    result_agg.loc[ii, column_i] = table_res[column_i].loc[0]
            if result_agg.loc[ii].iloc[1:].sum() == 0:
                individual_succ += 1
                pass_id.append(idx)
        except Exception:
            pass
        yield {
            "stage": "commonsense",
            "progress": (ii + 1) / total * 100,
        }

    micro_accuracy = 1.0 - result_agg.drop("data_id", axis=1).sum().sum() / (
        total * (result_agg.shape[1] - 1)
    )
    macro_accuracy = individual_succ / total
    result["EPR_micro"] = micro_accuracy * 100
    result["EPR_macro"] = macro_accuracy * 100
    result["E_pass_id"] = pass_id


def evaluate_hard_constraints_v2(
    data_index, symbolic_input_dict, plan_json_dict, env_pass_id, result: dict
):
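    """Evaluate each query's hard logical constraints (hard_logic_py).

    Stores the micro/macro logical pass rates under result["LPR_micro"] /
    result["LPR_macro"], the conditional micro rate that only credits
    plans in env_pass_id under result["C-LPR"], and the ids of plans
    satisfying all of their constraints under result["L_pass_id"].
    """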
    max_logic_num = 0
    for idx in data_index:
        max_logic_num = max(
            max_logic_num, len(symbolic_input_dict[idx]["hard_logic_py"])
        )
    columns = ["data_id"]
    for i in range(max_logic_num):
        columns.append(f"logic_py_{i}")
    result_agg = pd.DataFrame(columns=columns)
    for col_i in result_agg.columns[1:]:
        result_agg[col_i] = 0
    macro_count, macro_succ_count = 0, 0
    micro_count, micro_succ_count = 0, 0
    conditional_micro_succ_count, conditional_macro_succ_count = 0, 0
    passed_id = []
    total = len(data_index)
    for ii, idx in enumerate(data_index):
        symbolic_input, plan_json = symbolic_input_dict[idx], plan_json_dict[idx]
        result_ii = evaluate_constraints_py(
            symbolic_input["hard_logic_py"], plan_json, verbose=False
        )
        dict_ii = {}
        succ_c_sum = 0
        for logic_i in range(len(symbolic_input["hard_logic_py"])):
            dict_ii[f"logic_py_{logic_i}"] = int(result_ii[logic_i])
            succ_c_sum += int(result_ii[logic_i])
        macro_count += 1
        macro_succ_count += int(succ_c_sum == len(dict_ii))
        micro_count += len(dict_ii)
        micro_succ_count += succ_c_sum
        if idx in env_pass_id:
            conditional_micro_succ_count += succ_c_sum
            conditional_macro_succ_count += int(succ_c_sum == len(dict_ii))
        if succ_c_sum == len(dict_ii):
            passed_id.append(idx)
        dict_ii["data_id"] = idx
        result_agg.loc[ii] = pd.Series(dict_ii)
        yield {
            "stage": "logic",
            "progress": (ii + 1) / total * 100,
        }

    macro = macro_succ_count / macro_count
    micro = micro_succ_count / micro_count
    c_macro = conditional_macro_succ_count / macro_count
    c_micro = conditional_micro_succ_count / micro_count
    result["LPR_micro"] = micro * 100
    result["LPR_macro"] = macro * 100
    result["C-LPR"] = c_micro * 100
    result["L_pass_id"] = passed_id


def evaluate(args, result):
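    """Run schema, commonsense, and hard-constraint evaluation end to end.

    Streams progress events from every stage, computes the final pass rate
    result["FPR"] (plans passing all three stages), and finishes with a
    "final" event carrying the aggregated metrics.
    """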
    eval_result = {}

    query_index, query_data = load_query(args)
    result_data, matched_uid, unmatched_uid = load_result(
        args.result_dir, query_index=query_index
    )
    eval_result["matched_uid"] = matched_uid
    eval_result["unmatched_uid"] = unmatched_uid

    schema_file_path = "chinatravel/evaluation/output_schema.json"
    schema = load_json_file(schema_file_path)

    yield from evaluate_schema_constraints(
        query_index, result_data, schema=schema, result=eval_result
    )

    yield from evaluate_commonsense_constraints(
        query_index, query_data, result_data, result=eval_result
    )

    yield from evaluate_hard_constraints_v2(
        query_index,
        query_data,
        result_data,
        env_pass_id=eval_result.get("E_pass_id", []),
        result=eval_result,
    )

    all_pass_id = set(query_index)
    for key in eval_result:
        if "pass_id" in key:
            all_pass_id.intersection_update(set(eval_result[key]))
    eval_result["FPR"] = len(all_pass_id) / len(query_index) * 100

    del_keys = [key for key in eval_result if "pass_id" in key]
    for key in del_keys:
        del eval_result[key]
    result.update(eval_result)
    yield {
        "stage": "final",
        "progress": 100,
        "result": result,
    }
|
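
# Minimal driver sketch (illustrative, not part of this module's API): `evaluate`
# is a generator that streams progress events and ends with a "final" event that
# carries the aggregated metrics. The `args` fields below are assumptions --
# `load_query` may expect different attribute names in the real CLI.
#
#     from types import SimpleNamespace
#
#     args = SimpleNamespace(splits="easy", result_dir="results/my_run")  # hypothetical fields
#     scores = {}
#     for event in evaluate(args, scores):
#         print(f"{event['stage']}: {event['progress']:.1f}%")
#         if event["stage"] == "final":
#             print(event["result"])  # DR, EPR_*, LPR_*, C-LPR, FPR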