|
|
|
|
|
import difflib |
|
from typing import List, Tuple |
|
|
|
import editdistance |
|
|
|
|
|
class ChunkSimilarity(object):
    """Compute similarity scores between two text chunks.

    Every scoring method returns a list of ``(metric_name, score, note)``
    rows so the results can be tabulated together with the header row
    produced by :meth:`similar`.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score the two strings by Levenshtein edit distance.

        Returns three rows:
          * ``edit_distance`` - the raw (integer) edit distance,
          * ``ed_score``      - ``1 - d / (l1 + l2)``,
          * ``ed_score2``     - ``1 - 2*d / (l1 + l2)``,
        where ``l1``/``l2`` are the input lengths. Scores are rounded to
        4 decimal places.
        """
        edit_distance = editdistance.distance(chunk, match_content)

        total_length = len(chunk) + len(match_content)
        # Guard against ZeroDivisionError: two empty strings are identical,
        # so report distance 0 and perfect similarity.
        if total_length == 0:
            return [
                ("edit_distance", 0, ""),
                ("ed_score", 1.0, "1 - d / (l1 + l2)"),
                ("ed_score2", 1.0, "1 - 2*d / (l1 + l2)"),
            ]

        normalized_edit_distance = edit_distance / total_length
        normalized_edit_distance2 = 2 * edit_distance / total_length

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score the two strings with ``difflib.SequenceMatcher.ratio()``.

        ``ratio()`` is ``2.0*M / T`` (M = matched characters, T = total
        length of both inputs), which makes it comparable to the
        normalized edit-distance scores. Rounded to 4 decimal places.
        """
        # NOTE: return annotation fixed from Tuple[str, str, str] — the
        # score element is a float, not a str.
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str):
        """Run all scoring methods and return their rows under a header row."""
        result = [
            ("metric", "score", "note")
        ]
        scores = self.edit_distance(chunk, match_content)
        result.extend(scores)
        scores = self.seq_match(chunk, match_content)
        result.extend(scores)

        return result
|
|
|
|
|
# Sample chunk #1: a short Chinese headline about generative AI in 2023.
CHUNK_TRUE = """

2023年人工智能发展现状:生成式AI的突破之年

"""


# Sample chunk #2: an English paragraph about LoRA / PEFT methods.
# Deliberately unrelated to CHUNK_TRUE so the demo exercises the
# low-similarity end of the metrics.
CHUNK_EDIT = """

Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.

LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).

LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.

Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.

"""
|
|
|
|
|
def main():
    """Print every similarity metric for the two sample chunks."""
    chunk_similarity = ChunkSimilarity()
    rows = chunk_similarity.similar(
        CHUNK_TRUE,
        CHUNK_EDIT
    )
    # Each row is (metric, score, note); the note column is not printed.
    # The first row is the header, so "metric: score" prints as a heading.
    # Unpack in the for-header with distinct names — the original rebound
    # the loop variable `score` to one of its own elements (shadowing).
    for metric, value, _ in rows:
        print(f"{metric}: {value}")
|
|
|
|
|
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":

    main()
|
|