|
|
|
|
|
import difflib |
|
from typing import List, Tuple |
|
|
|
import editdistance |
|
|
|
|
|
class ChunkSimilarity(object):
    """Compute similarity scores between two text chunks.

    Every scoring method returns a list of ``(metric_name, score, note)``
    rows so the results can be tabulated together with the header row
    produced by :meth:`similar`.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score the two strings by Levenshtein edit distance.

        Returns three rows:
          * ``edit_distance`` - the raw (integer) edit distance,
          * ``ed_score``      - ``1 - d / (l1 + l2)``,
          * ``ed_score2``     - ``1 - 2*d / (l1 + l2)``,
        where ``l1``/``l2`` are the input lengths. Scores are rounded to
        4 decimal places.
        """
        edit_distance = editdistance.distance(chunk, match_content)

        total_length = len(chunk) + len(match_content)
        # Guard against ZeroDivisionError: two empty strings are identical,
        # so report distance 0 and perfect similarity.
        if total_length == 0:
            return [
                ("edit_distance", 0, ""),
                ("ed_score", 1.0, "1 - d / (l1 + l2)"),
                ("ed_score2", 1.0, "1 - 2*d / (l1 + l2)"),
            ]

        normalized_edit_distance = edit_distance / total_length
        normalized_edit_distance2 = 2 * edit_distance / total_length

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Score the two strings with ``difflib.SequenceMatcher.ratio()``.

        ``ratio()`` is ``2.0*M / T`` (M = matched characters, T = total
        length of both inputs), which makes it comparable to the
        normalized edit-distance scores. Rounded to 4 decimal places.
        """
        # NOTE: return annotation fixed from Tuple[str, str, str] — the
        # score element is a float, not a str.
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str):
        """Run all scoring methods and return their rows under a header row."""
        result = [
            ("metric", "score", "note")
        ]
        scores = self.edit_distance(chunk, match_content)
        result.extend(scores)
        scores = self.seq_match(chunk, match_content)
        result.extend(scores)

        return result
|
|
|
|
|
# Sample chunk #1: a short Chinese headline about generative AI in 2023.
CHUNK_TRUE = """

2023年人工智能发展现状:生成式AI的突破之年

"""


# Sample chunk #2: an English paragraph about LoRA / PEFT methods.
# Deliberately unrelated to CHUNK_TRUE so the demo exercises the
# low-similarity end of the metrics.
CHUNK_EDIT = """

Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.

LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).

LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.

Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.

"""
|
|
|
|
|
def main():
    """Print every similarity metric for the two sample chunks."""
    chunk_similarity = ChunkSimilarity()
    rows = chunk_similarity.similar(
        CHUNK_TRUE,
        CHUNK_EDIT
    )
    # Each row is (metric, score, note); the note column is not printed.
    # The first row is the header, so "metric: score" prints as a heading.
    # Unpack in the for-header with distinct names — the original rebound
    # the loop variable `score` to one of its own elements (shadowing).
    for metric, value, _ in rows:
        print(f"{metric}: {value}")
|
|
|
|
|
# Run the demo only when executed as a script (not on import).
if __name__ == "__main__":

    main()
|
|