|
|
|
|
|
import difflib |
|
from typing import List, Tuple |
|
|
|
import editdistance |
|
|
|
from project_settings import project_path |
|
from toolbox.string.tokenization import FastTokenizer |
|
|
|
|
|
class ChunkSearcher(object):
    """Locate the span of a long text that best matches a query chunk.

    Both texts are tokenized with a vocabulary-driven ``FastTokenizer``.
    A fixed-size window (the chunk's token count scaled by a ratio) is
    slid over the content tokens; the window containing the most tokens
    shared with the chunk wins.  Characters the chunk does not contain
    are then trimmed from both ends of the winning span.
    """

    def __init__(self,
                 vocab_file: str = (project_path / "data/vocab.txt").as_posix()
                 ):
        # Built once; reused across all chunk_search() calls.
        self.tokenizer = self.init_tokenizer(vocab_file)

    @staticmethod
    def init_tokenizer(vocab_file: str):
        """Build a FastTokenizer from a vocabulary file (one token per line).

        :param vocab_file: path to a UTF-8 text file of tokens.
        :return: populated ``FastTokenizer`` instance.
        """
        tokenizer = FastTokenizer()
        with open(vocab_file, "r", encoding="utf-8") as f:
            for row in f:
                token = str(row).strip()
                tokenizer.insert(token)
        return tokenizer

    def chunk_search(self, chunk: str, content: str, win_size_radio: float = 1.5) -> str:
        """Return the substring of ``content`` most similar to ``chunk``.

        :param chunk: query text to look for.
        :param content: text to search within.
        :param win_size_radio: window size as a multiple of the chunk's
            token count.  (Name kept for backward compatibility; it
            means "ratio".)
        :return: best-matching span of ``content``; empty string when
            ``content`` yields no tokens.
        """
        chunk_tokens, _ = self.tokenizer.tokenize(chunk)
        content_tokens, _ = self.tokenizer.tokenize(content)

        if not content_tokens:
            # Nothing to search; ``max()`` below would raise on an empty list.
            return ""

        # Mark content positions whose token also occurs in the chunk.
        # Set membership makes this O(n + m) instead of the original
        # O(n * m) nested scan; whitespace-only chunk tokens are ignored.
        chunk_token_set = {t for t in chunk_tokens if t.strip()}
        counter = [1 if t in chunk_token_set else 0 for t in content_tokens]

        win_size = int(len(chunk_tokens) * win_size_radio)

        # Score each window start by how many matched tokens it covers.
        # The ``+ 1`` includes the final full window, which the original
        # off-by-one (`range(0, n - win_size)`) skipped.
        win_score = [0] * len(content_tokens)
        for begin in range(0, len(content_tokens) - win_size + 1, 1):
            win_score[begin] = sum(counter[begin: begin + win_size])

        # First window with the highest score wins (ties -> earliest).
        idx = win_score.index(max(win_score))

        match_content = "".join(content_tokens[idx: idx + win_size])

        # Trim characters absent from the chunk: first from the right,
        # then (via reversal) from the left.
        match_content = self.rstrip_match_content(chunk, match_content)
        match_content = self.rstrip_match_content(chunk[::-1], match_content[::-1])
        match_content = match_content[::-1]

        return match_content

    def rstrip_match_content(self, chunk: str, match_content: str) -> str:
        """Strip trailing characters of ``match_content`` not present in ``chunk``.

        Runs a character-level diff; a trailing run of ``-`` operations
        marks characters that exist only in ``match_content``.

        :param chunk: reference text.
        :param match_content: candidate span to trim on the right.
        :return: trimmed (and whitespace-stripped) ``match_content``.
        """
        differ = difflib.Differ()
        diff = differ.compare(match_content, chunk)

        # First character of each diff line is the opcode: " ", "-", "+".
        operation_list = [d[0] for d in diff]

        r_strip_count = 0
        for operation in reversed(operation_list):
            if operation != "-":
                break
            r_strip_count += 1

        if r_strip_count != 0:
            match_content = match_content[:-r_strip_count].strip()
        return match_content
|
|
|
|
|
class ChunkSimilarity(object):
    """Compute similarity scores between a chunk and its matched content.

    Each scoring method returns a list of ``(metric_name, score, note)``
    tuples so results can be tabulated uniformly.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Edit-distance based similarity scores.

        :param chunk: query text.
        :param match_content: matched span to compare against.
        :return: raw edit distance plus two length-normalized scores.
        """
        edit_distance = editdistance.distance(chunk, match_content)

        chunk_length = len(chunk)
        content_length = len(match_content)
        total_length = chunk_length + content_length

        if total_length == 0:
            # Two empty strings are identical: distance 0, similarity 1.
            # Guards the divisions below against ZeroDivisionError.
            normalized_edit_distance = 0.0
            normalized_edit_distance2 = 0.0
        else:
            normalized_edit_distance = edit_distance / total_length
            normalized_edit_distance2 = 2 * edit_distance / total_length

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Similarity via ``difflib.SequenceMatcher.ratio`` (2.0*M / T).

        Return annotation fixed: the score element is a float, not a str.
        """
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Collect all similarity metrics, prefixed with a header row."""
        result = [
            ("metric", "score", "note")
        ]
        result.extend(self.edit_distance(chunk, match_content))
        result.extend(self.seq_match(chunk, match_content))

        return result
|
|
|
|
|
# Demo fixture: page text extracted from a Chinese-language PDF (McKinsey
# China Financial CEO quarterly, 2023).  The irregular spacing inside the
# text is PDF-extraction noise and is intentionally preserved — it is the
# kind of input chunk_search() is meant to handle.
PAGE_CONTENT = """

40

麦肯锡中国金融业 CEO季刊 2023年秋季刊

2023年人工智能发展现状:

生成式 AI的突破之年

Michael Chui ,Eric Hazan ,Lareina Yee ,Bryce Hall ,Alex Singla

和Alexander Sukharevsky如 今 ,生 成 式 AI工具遍地开花, 各组织均在快速部署; 麦肯锡调查的

受访者们预计, 该技术将对自己所在行业及就业产生重大影响。



41

2023年 人 工 智 能 发 展 现 状 :生 成 式 AI的突破之年

麦肯锡针对人工智能发展现状的最新年度全球调研结果证实, 生

成式人工智能 (简称 GenAI )工 具 已 出 现 爆 炸 式 增 长 。许 多 此 类 工

具 至 今 推 出 尚 不 满 一 年 ,但 已 有 1/3的 受 访 者 表 示 ,其 所 在 组 织 会 在

至少一项业务职能中经常使 用 GenAI 。 随着这些最新进展, 人工智

能 已 经 从 一 个 技 术 话 题 上 升 为 企 业 领 导 的 关 注 焦 点 :近 1/4受访高

管 表 示 ,他 们 会 在 工 作 中 使 用 GenAI 工具; 而在已应用人工智能的

企 业 中,有 超 过 1/4的受访者表示 GenAI 已 被 列 入 董 事 会 议 程 。此 外 ,

40% 的受访者表示, 其所在组织将会因 GenAI 的最新进 展而增加对

人工智能的整体投入。 调查结果表明, GenAI 相关风险管理仍处于

早期阶段: 即便是针对受访者眼中最常见的不准确问题, 也只有不

"""


# Demo query: the chunk main() searches for inside PAGE_CONTENT.
CHUNK = """2023年人工智能发展现状:生成式AI的突破之年"""


# Additional English fixture (LoRA / PEFT abstract).  NOTE(review):
# currently unused by main() — kept, presumably, for manual experiments.
CHUNK1 = """

Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.

LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).

LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.

Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.

"""
|
|
|
|
|
def main():
    """Demo: locate CHUNK inside PAGE_CONTENT and print similarity scores.

    Uses a slightly larger window ratio (1.6) than the default to give the
    trimming step some slack on the noisy PDF text.
    """
    # NOTE: the original re-imported project_path here; it was unused in
    # this function (and already imported at module level), so it is removed.
    searcher = ChunkSearcher()
    match_content = searcher.chunk_search(
        CHUNK,
        PAGE_CONTENT,
        win_size_radio=1.6,
    )
    print(match_content)

    chunk_similarity = ChunkSimilarity()
    scores = chunk_similarity.similar(CHUNK, match_content)
    print(scores)
    return
|
|
|
|
|
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":

    main()
|
|