Lukas Helff
committed on
Commit
·
f567a45
1
Parent(s):
62ba87c
fix validation program
Browse files
VerifiableRewardsForScalableLogicalReasoning.py
CHANGED
|
@@ -99,19 +99,21 @@ Returns:
|
|
| 99 |
detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
|
| 100 |
"""
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
|
@@ -128,15 +130,6 @@ def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5
|
|
| 128 |
# extract predicate from rule_to_evaluate
|
| 129 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
| 130 |
|
| 131 |
-
is_valid, validation_msg = validate_rule_no_hardcoded_cars(rule_to_evaluate)
|
| 132 |
-
if not is_valid:
|
| 133 |
-
return {
|
| 134 |
-
"is_correct": False,
|
| 135 |
-
"partial_score": 0.0,
|
| 136 |
-
"syntax_valid": False,
|
| 137 |
-
"error": f"Rule validation failed: {validation_msg}"
|
| 138 |
-
}
|
| 139 |
-
|
| 140 |
if positive_pred not in rule_to_evaluate:
|
| 141 |
p = prediction.replace('\n', ' ')
|
| 142 |
return {
|
|
@@ -175,8 +168,7 @@ check_count(Count) :-
|
|
| 175 |
check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
| 176 |
"""
|
| 177 |
# Add the rule to evaluate
|
| 178 |
-
validation_program =
|
| 179 |
-
validation_program = re.sub(rf'\b{negative_pred}\b', 'neg', validation_program)
|
| 180 |
|
| 181 |
pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
|
| 182 |
validation_program = '\n'.join(sorted(validation_program.splitlines()))
|
|
|
|
| 99 |
detailed_results (`list` of `dict`): Per-example results including correctness, partial score, execution time, and any errors encountered.
|
| 100 |
"""
|
| 101 |
|
| 102 |
+
def fix_validation_program(validation_program, positive_pred="eastbound", negative_pred="westbound"):
    """
    Normalizes a validation program so it uses anonymized, consistent names.

    - Renames the positive head predicate to `pos` and the negative one to `neg`
      (whole-word matches only).
    - Prefixes train and car constants with `my` (e.g. `train0` -> `mytrain0`,
      `car0_1` -> `mycar0_1`) when they directly follow `(` (trains and cars)
      or `, ` (cars only).

    Args:
        validation_program (`str`): Text of the validation program to rewrite.
        positive_pred (`str`, defaults to `"eastbound"`): Name of the positive predicate.
        negative_pred (`str`, defaults to `"westbound"`): Name of the negative predicate.

    Returns:
        `str`: The rewritten validation program.
    """
    # Anonymize the head predicates. The names are escaped so that a predicate
    # containing regex metacharacters is matched literally instead of being
    # interpreted as a pattern (which could rewrite unrelated atoms).
    validation_program = re.sub(rf'\b{re.escape(positive_pred)}\b', 'pos', validation_program)
    validation_program = re.sub(rf'\b{re.escape(negative_pred)}\b', 'neg', validation_program)
    # Replace train with mytrain and car with mycar.
    # Trains are expected to follow the pattern train\d+ and cars car\d+_\d+;
    # the replacement itself is purely textual and keyed on the preceding
    # "(" or ", " so that predicate names such as has_car( are left untouched.
    validation_program = validation_program.replace('(train', '(mytrain')
    validation_program = validation_program.replace('(car', '(mycar').replace(', car', ', mycar')
    return validation_program
|
| 117 |
|
| 118 |
|
| 119 |
def _evaluate_with_prolog(prediction, validation_program, eval_config, timeout=5):
|
|
|
|
| 130 |
# extract predicate from rule_to_evaluate
|
| 131 |
rule_to_evaluate = extract_ilp_from_text_v2(prediction, positive_pred, allow_multiple_rules)
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
if positive_pred not in rule_to_evaluate:
|
| 134 |
p = prediction.replace('\n', ' ')
|
| 135 |
return {
|
|
|
|
| 168 |
check_all :- forall((pos({vars});neg({vars})), check({vars})).
|
| 169 |
"""
|
| 170 |
# Add the rule to evaluate
|
| 171 |
+
validation_program = fix_validation_program(validation_program, positive_pred, negative_pred)
|
|
|
|
| 172 |
|
| 173 |
pos_negs = validation_program.count("pos(") + validation_program.count("neg(")
|
| 174 |
validation_program = '\n'.join(sorted(validation_program.splitlines()))
|