|
|
|
|
|
import difflib |
|
from typing import List, Tuple |
|
|
|
import editdistance |
|
|
|
from project_settings import project_path |
|
from toolbox.string.tokenization import FastTokenizer |
|
|
|
|
|
class ChunkSearcher(object):
    """Locate the span of a long text that best matches a query chunk.

    Both texts are tokenized with a vocabulary-driven ``FastTokenizer``.
    A fixed-size window (the chunk's token count scaled by a ratio) is
    slid over the content tokens; the window containing the most tokens
    shared with the chunk wins.  Characters the chunk does not contain
    are then trimmed from both ends of the winning span.
    """

    def __init__(self,
                 vocab_file: str = (project_path / "data/vocab.txt").as_posix()
                 ):
        # Built once; reused across all chunk_search() calls.
        self.tokenizer = self.init_tokenizer(vocab_file)

    @staticmethod
    def init_tokenizer(vocab_file: str):
        """Build a FastTokenizer from a vocabulary file (one token per line).

        :param vocab_file: path to a UTF-8 text file of tokens.
        :return: populated ``FastTokenizer`` instance.
        """
        tokenizer = FastTokenizer()
        with open(vocab_file, "r", encoding="utf-8") as f:
            for row in f:
                token = str(row).strip()
                tokenizer.insert(token)
        return tokenizer

    def chunk_search(self, chunk: str, content: str, win_size_radio: float = 1.5) -> str:
        """Return the substring of ``content`` most similar to ``chunk``.

        :param chunk: query text to look for.
        :param content: text to search within.
        :param win_size_radio: window size as a multiple of the chunk's
            token count.  (Name kept for backward compatibility; it
            means "ratio".)
        :return: best-matching span of ``content``; empty string when
            ``content`` yields no tokens.
        """
        chunk_tokens, _ = self.tokenizer.tokenize(chunk)
        content_tokens, _ = self.tokenizer.tokenize(content)

        if not content_tokens:
            # Nothing to search; ``max()`` below would raise on an empty list.
            return ""

        # Mark content positions whose token also occurs in the chunk.
        # Set membership makes this O(n + m) instead of the original
        # O(n * m) nested scan; whitespace-only chunk tokens are ignored.
        chunk_token_set = {t for t in chunk_tokens if t.strip()}
        counter = [1 if t in chunk_token_set else 0 for t in content_tokens]

        win_size = int(len(chunk_tokens) * win_size_radio)

        # Score each window start by how many matched tokens it covers.
        # The ``+ 1`` includes the final full window, which the original
        # off-by-one (`range(0, n - win_size)`) skipped.
        win_score = [0] * len(content_tokens)
        for begin in range(0, len(content_tokens) - win_size + 1, 1):
            win_score[begin] = sum(counter[begin: begin + win_size])

        # First window with the highest score wins (ties -> earliest).
        idx = win_score.index(max(win_score))

        match_content = "".join(content_tokens[idx: idx + win_size])

        # Trim characters absent from the chunk: first from the right,
        # then (via reversal) from the left.
        match_content = self.rstrip_match_content(chunk, match_content)
        match_content = self.rstrip_match_content(chunk[::-1], match_content[::-1])
        match_content = match_content[::-1]

        return match_content

    def rstrip_match_content(self, chunk: str, match_content: str) -> str:
        """Strip trailing characters of ``match_content`` not present in ``chunk``.

        Runs a character-level diff; a trailing run of ``-`` operations
        marks characters that exist only in ``match_content``.

        :param chunk: reference text.
        :param match_content: candidate span to trim on the right.
        :return: trimmed (and whitespace-stripped) ``match_content``.
        """
        differ = difflib.Differ()
        diff = differ.compare(match_content, chunk)

        # First character of each diff line is the opcode: " ", "-", "+".
        operation_list = [d[0] for d in diff]

        r_strip_count = 0
        for operation in reversed(operation_list):
            if operation != "-":
                break
            r_strip_count += 1

        if r_strip_count != 0:
            match_content = match_content[:-r_strip_count].strip()
        return match_content
|
|
|
|
|
class ChunkSimilarity(object):
    """Compute similarity scores between a chunk and its matched content.

    Each scoring method returns a list of ``(metric_name, score, note)``
    tuples so results can be tabulated uniformly.
    """

    def edit_distance(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Edit-distance based similarity scores.

        :param chunk: query text.
        :param match_content: matched span to compare against.
        :return: raw edit distance plus two length-normalized scores.
        """
        edit_distance = editdistance.distance(chunk, match_content)

        chunk_length = len(chunk)
        content_length = len(match_content)
        total_length = chunk_length + content_length

        if total_length == 0:
            # Two empty strings are identical: distance 0, similarity 1.
            # Guards the divisions below against ZeroDivisionError.
            normalized_edit_distance = 0.0
            normalized_edit_distance2 = 0.0
        else:
            normalized_edit_distance = edit_distance / total_length
            normalized_edit_distance2 = 2 * edit_distance / total_length

        result = [
            ("edit_distance", edit_distance, ""),
            (
                "ed_score", round(1 - normalized_edit_distance, 4),
                "1 - d / (l1 + l2)"
            ),
            (
                "ed_score2", round(1 - normalized_edit_distance2, 4),
                "1 - 2*d / (l1 + l2)"
            ),
        ]
        return result

    def seq_match(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Similarity via ``difflib.SequenceMatcher.ratio`` (2.0*M / T).

        Return annotation fixed: the score element is a float, not a str.
        """
        seq_match = difflib.SequenceMatcher()
        seq_match.set_seqs(chunk, match_content)
        score = seq_match.ratio()

        result = [
            ("seq_match", round(score, 4), "(2.0*M / T) similar to edit_distance"),
        ]
        return result

    def similar(self, chunk: str, match_content: str) -> List[Tuple[str, float, str]]:
        """Collect all similarity metrics, prefixed with a header row."""
        result = [
            ("metric", "score", "note")
        ]
        result.extend(self.edit_distance(chunk, match_content))
        result.extend(self.seq_match(chunk, match_content))

        return result
|
|
|
|
|
# Demo fixture: page text extracted from a Chinese-language PDF (McKinsey
# China Financial CEO quarterly, 2023).  The irregular spacing inside the
# text is PDF-extraction noise and is intentionally preserved — it is the
# kind of input chunk_search() is meant to handle.
PAGE_CONTENT = """

40

麦肯锡中国金融业 CEO季刊 2023年秋季刊

2023年人工智能发展现状:

生成式 AI的突破之年

Michael Chui ,Eric Hazan ,Lareina Yee ,Bryce Hall ,Alex Singla

和Alexander Sukharevsky如 今 ,生 成 式 AI工具遍地开花, 各组织均在快速部署; 麦肯锡调查的

受访者们预计, 该技术将对自己所在行业及就业产生重大影响。



41

2023年 人 工 智 能 发 展 现 状 :生 成 式 AI的突破之年

麦肯锡针对人工智能发展现状的最新年度全球调研结果证实, 生

成式人工智能 (简称 GenAI )工 具 已 出 现 爆 炸 式 增 长 。许 多 此 类 工

具 至 今 推 出 尚 不 满 一 年 ,但 已 有 1/3的 受 访 者 表 示 ,其 所 在 组 织 会 在

至少一项业务职能中经常使 用 GenAI 。 随着这些最新进展, 人工智

能 已 经 从 一 个 技 术 话 题 上 升 为 企 业 领 导 的 关 注 焦 点 :近 1/4受访高

管 表 示 ,他 们 会 在 工 作 中 使 用 GenAI 工具; 而在已应用人工智能的

企 业 中,有 超 过 1/4的受访者表示 GenAI 已 被 列 入 董 事 会 议 程 。此 外 ,

40% 的受访者表示, 其所在组织将会因 GenAI 的最新进 展而增加对

人工智能的整体投入。 调查结果表明, GenAI 相关风险管理仍处于

早期阶段: 即便是针对受访者眼中最常见的不准确问题, 也只有不

"""


# Demo query: the chunk main() searches for inside PAGE_CONTENT.
CHUNK = """2023年人工智能发展现状:生成式AI的突破之年"""


# Additional English fixture (LoRA / PEFT abstract).  NOTE(review):
# currently unused by main() — kept, presumably, for manual experiments.
CHUNK1 = """

Among these PEFT methods, the reparameterization-based method low-rank adaptation (LoRA) (Hu et al.,2021) is considered one of the most efficient and effective methods at present.

LoRA is especially popular after open-sourced LLMs become ubiquitous (Dettmers et al., 2023).

LoRA assumes that the change of the model’s parameters for adaptation is intrinsically low-dimensional and performs adaptation by optimizing the matrix obtained from low-rank decomposition.

Since it is in the form of weight matrix reparameterization, LoRA parameters can be merged with the original LLMs and cause no forward propagation latency.

"""
|
|
|
|
|
def main():
    """Demo: locate CHUNK inside PAGE_CONTENT and print similarity scores.

    Uses a slightly larger window ratio (1.6) than the default to give the
    trimming step some slack on the noisy PDF text.
    """
    # NOTE: the original re-imported project_path here; it was unused in
    # this function (and already imported at module level), so it is removed.
    searcher = ChunkSearcher()
    match_content = searcher.chunk_search(
        CHUNK,
        PAGE_CONTENT,
        win_size_radio=1.6,
    )
    print(match_content)

    chunk_similarity = ChunkSimilarity()
    scores = chunk_similarity.similar(CHUNK, match_content)
    print(scores)
    return
|
|
|
|
|
# Run the demo only when executed as a script, not on import.
if __name__ == "__main__":

    main()
|
|