github-actions[bot] commited on
Commit
fb9c306
·
1 Parent(s): e453a65

Auto-sync from demo at Thu Aug 28 09:22:58 UTC 2025

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +0 -6
  2. .gitattributes +0 -35
  3. .gitignore +0 -179
  4. README.md +0 -14
  5. app.py +281 -214
  6. graphgen/configs/README.md +1 -0
  7. graphgen/configs/aggregated_config.yaml +21 -0
  8. graphgen/configs/atomic_config.yaml +21 -0
  9. graphgen/configs/config.yaml.example +0 -16
  10. graphgen/configs/cot_config.yaml +13 -0
  11. graphgen/configs/graphgen_config.yaml +0 -16
  12. graphgen/configs/multi_hop_config.yaml +21 -0
  13. graphgen/generate.py +64 -62
  14. graphgen/graphgen.py +232 -97
  15. graphgen/models/__init__.py +18 -14
  16. graphgen/models/community/__init__.py +0 -0
  17. graphgen/models/community/community_detector.py +95 -0
  18. graphgen/models/llm/openai_model.py +58 -33
  19. graphgen/models/search/db/__init__.py +0 -0
  20. graphgen/models/search/db/uniprot_search.py +64 -0
  21. graphgen/models/search/kg/__init__.py +0 -0
  22. graphgen/models/search/{wiki_search.py → kg/wiki_search.py} +4 -3
  23. graphgen/models/search/web/__init__.py +0 -0
  24. graphgen/models/search/web/bing_search.py +43 -0
  25. graphgen/models/search/web/google_search.py +45 -0
  26. graphgen/models/storage/base_storage.py +25 -4
  27. graphgen/models/storage/json_storage.py +39 -3
  28. graphgen/models/vis/__init__.py +0 -0
  29. graphgen/models/vis/community_visualizer.py +48 -0
  30. graphgen/operators/__init__.py +13 -7
  31. graphgen/operators/generate/__init__.py +0 -0
  32. graphgen/operators/generate/generate_cot.py +117 -0
  33. graphgen/operators/judge.py +48 -87
  34. graphgen/operators/kg/__init__.py +0 -0
  35. graphgen/operators/{extract_kg.py → kg/extract_kg.py} +48 -29
  36. graphgen/operators/{merge_kg.py → kg/merge_kg.py} +38 -41
  37. graphgen/operators/{split_graph.py → kg/split_kg.py} +92 -44
  38. graphgen/operators/preprocess/__init__.py +0 -0
  39. graphgen/operators/{resolute_coreference.py → preprocess/resolute_coreference.py} +8 -8
  40. graphgen/operators/search/__init__.py +0 -0
  41. graphgen/operators/search/db/__init__.py +0 -0
  42. graphgen/operators/search/db/search_uniprot.py +0 -0
  43. graphgen/operators/search/kg/__init__.py +0 -0
  44. graphgen/operators/search/kg/search_wikipedia.py +58 -0
  45. graphgen/operators/search/search_all.py +82 -0
  46. graphgen/operators/search/web/__init__.py +0 -0
  47. graphgen/operators/search/web/search_bing.py +53 -0
  48. graphgen/operators/search/web/search_google.py +49 -0
  49. graphgen/operators/search_wikipedia.py +0 -71
  50. graphgen/operators/traverse_graph.py +199 -148
.env.example DELETED
@@ -1,6 +0,0 @@
1
- SYNTHESIZER_MODEL=
2
- SYNTHESIZER_BASE_URL=
3
- SYNTHESIZER_API_KEY=
4
- TRAINEE_MODEL=
5
- TRAINEE_BASE_URL=
6
- TRAINEE_API_KEY=
 
 
 
 
 
 
 
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore DELETED
@@ -1,179 +0,0 @@
1
- # Byte-compiled / optimized / DLL files
2
- __pycache__/
3
- *.py[cod]
4
- *$py.class
5
-
6
- # C extensions
7
- *.so
8
-
9
- # Distribution / packaging
10
- .Python
11
- build/
12
- develop-eggs/
13
- dist/
14
- downloads/
15
- eggs/
16
- .eggs/
17
- lib/
18
- lib64/
19
- parts/
20
- sdist/
21
- var/
22
- wheels/
23
- share/python-wheels/
24
- *.egg-info/
25
- .installed.cfg
26
- *.egg
27
- MANIFEST
28
-
29
- # PyInstaller
30
- # Usually these files are written by a python script from a template
31
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
- *.manifest
33
- *.spec
34
-
35
- # Installer logs
36
- pip-log.txt
37
- pip-delete-this-directory.txt
38
-
39
- # Unit test / coverage reports
40
- htmlcov/
41
- .tox/
42
- .nox/
43
- .coverage
44
- .coverage.*
45
- .cache
46
- nosetests.xml
47
- coverage.xml
48
- *.cover
49
- *.py,cover
50
- .hypothesis/
51
- .pytest_cache/
52
- cover/
53
-
54
- # Translations
55
- *.mo
56
- *.pot
57
-
58
- # Django stuff:
59
- *.log
60
- local_settings.py
61
- db.sqlite3
62
- db.sqlite3-journal
63
-
64
- # Flask stuff:
65
- instance/
66
- .webassets-cache
67
-
68
- # Scrapy stuff:
69
- .scrapy
70
-
71
- # Sphinx documentation
72
- docs/_build/
73
-
74
- # PyBuilder
75
- .pybuilder/
76
- target/
77
-
78
- # Jupyter Notebook
79
- .ipynb_checkpoints
80
-
81
- # IPython
82
- profile_default/
83
- ipython_config.py
84
-
85
- # pyenv
86
- # For a library or package, you might want to ignore these files since the code is
87
- # intended to run in multiple environments; otherwise, check them in:
88
- # .python-version
89
-
90
- # pipenv
91
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
- # install all needed dependencies.
95
- #Pipfile.lock
96
-
97
- # UV
98
- # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
- # This is especially recommended for binary packages to ensure reproducibility, and is more
100
- # commonly ignored for libraries.
101
- #uv.lock
102
-
103
- # poetry
104
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
- # This is especially recommended for binary packages to ensure reproducibility, and is more
106
- # commonly ignored for libraries.
107
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
- #poetry.lock
109
-
110
- # pdm
111
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
- #pdm.lock
113
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
- # in version control.
115
- # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
- .pdm.toml
117
- .pdm-python
118
- .pdm-build/
119
-
120
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
- __pypackages__/
122
-
123
- # Celery stuff
124
- celerybeat-schedule
125
- celerybeat.pid
126
-
127
- # SageMath parsed files
128
- *.sage.py
129
-
130
- # Environments
131
- .env
132
- .venv
133
- env/
134
- venv/
135
- ENV/
136
- env.bak/
137
- venv.bak/
138
-
139
- # Spyder project settings
140
- .spyderproject
141
- .spyproject
142
-
143
- # Rope project settings
144
- .ropeproject
145
-
146
- # mkdocs documentation
147
- /site
148
-
149
- # mypy
150
- .mypy_cache/
151
- .dmypy.json
152
- dmypy.json
153
-
154
- # Pyre type checker
155
- .pyre/
156
-
157
- # pytype static type analyzer
158
- .pytype/
159
-
160
- # Cython debug symbols
161
- cython_debug/
162
-
163
- # PyCharm
164
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
- # and can be added to the global gitignore or merged into this file. For a more nuclear
167
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
- .idea/
169
-
170
- # Ruff stuff:
171
- .ruff_cache/
172
-
173
- # PyPI configuration file
174
- .pypirc
175
-
176
- cache
177
- *.pyc
178
- *.html
179
- .gradio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md DELETED
@@ -1,14 +0,0 @@
1
- ---
2
- title: GraphGen
3
- emoji: 🐠
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.32.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: A framework for synthetic data generation based on KG.
12
- ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,20 +1,19 @@
 
1
  import os
2
  import sys
3
- import json
4
  import tempfile
5
 
6
- import pandas as pd
7
  import gradio as gr
8
-
9
- from gradio_i18n import Translate, gettext as _
10
-
11
- from webui.base import GraphGenParams
12
- from webui.test_api import test_api_connection
13
- from webui.cache_utils import setup_workspace, cleanup_workspace
14
- from webui.count_tokens import count_tokens
15
 
16
  # pylint: disable=wrong-import-position
17
- root_dir = os.path.dirname(os.path.abspath(__file__))
18
  sys.path.append(root_dir)
19
 
20
  from graphgen.graphgen import GraphGen
@@ -22,7 +21,6 @@ from graphgen.models import OpenAIModel, Tokenizer, TraverseStrategy
22
  from graphgen.models.llm.limitter import RPM, TPM
23
  from graphgen.utils import set_logger
24
 
25
-
26
  css = """
27
  .center-row {
28
  display: flex;
@@ -37,9 +35,7 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
37
  log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
38
 
39
  set_logger(log_file, if_stream=False)
40
- graph_gen = GraphGen(
41
- working_dir=working_dir
42
- )
43
 
44
  # Set up LLM clients
45
  graph_gen.synthesizer_llm_client = OpenAIModel(
@@ -47,8 +43,8 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
47
  base_url=env.get("SYNTHESIZER_BASE_URL", ""),
48
  api_key=env.get("SYNTHESIZER_API_KEY", ""),
49
  request_limit=True,
50
- rpm= RPM(env.get("RPM", 1000)),
51
- tpm= TPM(env.get("TPM", 50000)),
52
  )
53
 
54
  graph_gen.trainee_llm_client = OpenAIModel(
@@ -56,16 +52,15 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
56
  base_url=env.get("TRAINEE_BASE_URL", ""),
57
  api_key=env.get("TRAINEE_API_KEY", ""),
58
  request_limit=True,
59
- rpm= RPM(env.get("RPM", 1000)),
60
- tpm= TPM(env.get("TPM", 50000)),
61
  )
62
 
63
- graph_gen.tokenizer_instance = Tokenizer(
64
- config.get("tokenizer", "cl100k_base"))
65
 
66
  strategy_config = config.get("traverse_strategy", {})
67
  graph_gen.traverse_strategy = TraverseStrategy(
68
- qa_form=config.get("qa_form"),
69
  expand_method=strategy_config.get("expand_method"),
70
  bidirectional=strategy_config.get("bidirectional"),
71
  max_extra_edges=strategy_config.get("max_extra_edges"),
@@ -73,11 +68,12 @@ def init_graph_gen(config: dict, env: dict) -> GraphGen:
73
  max_depth=strategy_config.get("max_depth"),
74
  edge_sampling=strategy_config.get("edge_sampling"),
75
  isolated_node_strategy=strategy_config.get("isolated_node_strategy"),
76
- loss_strategy=str(strategy_config.get("loss_strategy"))
77
  )
78
 
79
  return graph_gen
80
 
 
81
  # pylint: disable=too-many-statements
82
  def run_graphgen(params, progress=gr.Progress()):
83
  def sum_tokens(client):
@@ -87,10 +83,9 @@ def run_graphgen(params, progress=gr.Progress()):
87
  "if_trainee_model": params.if_trainee_model,
88
  "input_file": params.input_file,
89
  "tokenizer": params.tokenizer,
90
- "qa_form": params.qa_form,
91
- "web_search": False,
92
  "quiz_samples": params.quiz_samples,
93
  "traverse_strategy": {
 
94
  "bidirectional": params.bidirectional,
95
  "expand_method": params.expand_method,
96
  "max_extra_edges": params.max_extra_edges,
@@ -98,7 +93,7 @@ def run_graphgen(params, progress=gr.Progress()):
98
  "max_depth": params.max_depth,
99
  "edge_sampling": params.edge_sampling,
100
  "isolated_node_strategy": params.isolated_node_strategy,
101
- "loss_strategy": params.loss_strategy
102
  },
103
  "chunk_size": params.chunk_size,
104
  }
@@ -115,11 +110,15 @@ def run_graphgen(params, progress=gr.Progress()):
115
  }
116
 
117
  # Test API connection
118
- test_api_connection(env["SYNTHESIZER_BASE_URL"],
119
- env["SYNTHESIZER_API_KEY"], env["SYNTHESIZER_MODEL"])
120
- if config['if_trainee_model']:
121
- test_api_connection(env["TRAINEE_BASE_URL"],
122
- env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"])
 
 
 
 
123
 
124
  # Initialize GraphGen
125
  graph_gen = init_graph_gen(config, env)
@@ -129,7 +128,7 @@ def run_graphgen(params, progress=gr.Progress()):
129
 
130
  try:
131
  # Load input data
132
- file = config['input_file']
133
  if isinstance(file, list):
134
  file = file[0]
135
 
@@ -137,24 +136,22 @@ def run_graphgen(params, progress=gr.Progress()):
137
 
138
  if file.endswith(".jsonl"):
139
  data_type = "raw"
140
- with open(file, "r", encoding='utf-8') as f:
141
  data.extend(json.loads(line) for line in f)
142
  elif file.endswith(".json"):
143
  data_type = "chunked"
144
- with open(file, "r", encoding='utf-8') as f:
145
  data.extend(json.load(f))
146
  elif file.endswith(".txt"):
147
  # 读取文件后根据chunk_size转成raw格式的数据
148
  data_type = "raw"
149
  content = ""
150
- with open(file, "r", encoding='utf-8') as f:
151
  lines = f.readlines()
152
  for line in lines:
153
  content += line.strip() + " "
154
  size = int(config.get("chunk_size", 512))
155
- chunks = [
156
- content[i:i + size] for i in range(0, len(content), size)
157
- ]
158
  data.extend([{"content": chunk} for chunk in chunks])
159
  else:
160
  raise ValueError(f"Unsupported file type: {file}")
@@ -162,9 +159,9 @@ def run_graphgen(params, progress=gr.Progress()):
162
  # Process the data
163
  graph_gen.insert(data, data_type)
164
 
165
- if config['if_trainee_model']:
166
  # Generate quiz
167
- graph_gen.quiz(max_samples=config['quiz_samples'])
168
 
169
  # Judge statements
170
  graph_gen.judge()
@@ -174,47 +171,44 @@ def run_graphgen(params, progress=gr.Progress()):
174
  graph_gen.judge(skip=True)
175
 
176
  # Traverse graph
177
- graph_gen.traverse()
178
 
179
  # Save output
180
  output_data = graph_gen.qa_storage.data
181
  with tempfile.NamedTemporaryFile(
182
- mode="w",
183
- suffix=".jsonl",
184
- delete=False,
185
- encoding="utf-8") as tmpfile:
186
  json.dump(output_data, tmpfile, ensure_ascii=False)
187
  output_file = tmpfile.name
188
 
189
  synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client)
190
- trainee_tokens = sum_tokens(graph_gen.trainee_llm_client) if config['if_trainee_model'] else 0
 
 
 
 
191
  total_tokens = synthesizer_tokens + trainee_tokens
192
 
193
  data_frame = params.token_counter
194
  try:
195
  _update_data = [
196
- [
197
- data_frame.iloc[0, 0],
198
- data_frame.iloc[0, 1],
199
- str(total_tokens)
200
- ]
201
  ]
202
- new_df = pd.DataFrame(
203
- _update_data,
204
- columns=data_frame.columns
205
- )
206
  data_frame = new_df
207
 
208
  except Exception as e:
209
  raise gr.Error(f"DataFrame operation error: {str(e)}")
210
 
211
- return output_file, gr.DataFrame(label='Token Stats',
212
- headers=["Source Text Token Count", "Expected Token Usage", "Token Used"],
213
- datatype="str",
214
- interactive=False,
215
- value=data_frame,
216
- visible=True,
217
- wrap=True)
 
 
218
 
219
  except Exception as e: # pylint: disable=broad-except
220
  raise gr.Error(f"Error occurred: {str(e)}")
@@ -223,16 +217,18 @@ def run_graphgen(params, progress=gr.Progress()):
223
  # Clean up workspace
224
  cleanup_workspace(graph_gen.working_dir)
225
 
226
- with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
227
- css=css) as demo):
228
  # Header
229
- gr.Image(value="https://github.com/open-sciencelab/GraphGen/blob/main/resources/images/logo.png?raw=true",
230
- label="GraphGen Banner",
231
- elem_id="banner",
232
- interactive=False,
233
- container=False,
234
- show_download_button=False,
235
- show_fullscreen_button=False)
 
 
236
  lang_btn = gr.Radio(
237
  choices=[
238
  ("English", "en"),
@@ -245,7 +241,8 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
245
  elem_classes=["center-row"],
246
  )
247
 
248
- gr.HTML("""
 
249
  <div style="display: flex; gap: 8px; margin-left: auto; align-items: center; justify-content: center;">
250
  <a href="https://github.com/open-sciencelab/GraphGen/releases">
251
  <img src="https://img.shields.io/badge/Version-v0.1.0-blue" alt="Version">
@@ -260,80 +257,98 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
260
  <img src="https://img.shields.io/badge/arXiv-pdf-yellow" alt="arXiv">
261
  </a>
262
  </div>
263
- """)
 
264
  with Translate(
265
- os.path.join(root_dir, 'webui', 'translation.json'),
266
- lang_btn,
267
- placeholder_langs=["en", "zh"],
268
- persistant=
269
- False, # True to save the language setting in the browser. Requires gradio >= 5.6.0
270
  ):
271
  lang_btn.render()
272
 
273
  gr.Markdown(
274
- value = "# " + _("Title") + "\n\n" + \
275
- "### [GraphGen](https://github.com/open-sciencelab/GraphGen) " + _("Intro")
 
 
 
276
  )
277
 
278
- if_trainee_model = gr.Checkbox(label=_("Use Trainee Model"),
279
- value=False,
280
- interactive=True)
281
 
282
  with gr.Accordion(label=_("Model Config"), open=False):
283
- synthesizer_url = gr.Textbox(label="Synthesizer URL",
284
- value="https://api.siliconflow.cn/v1",
285
- info=_("Synthesizer URL Info"),
286
- interactive=True)
287
- synthesizer_model = gr.Textbox(label="Synthesizer Model",
288
- value="Qwen/Qwen2.5-7B-Instruct",
289
- info=_("Synthesizer Model Info"),
290
- interactive=True)
291
- trainee_url = gr.Textbox(label="Trainee URL",
292
- value="https://api.siliconflow.cn/v1",
293
- info=_("Trainee URL Info"),
294
- interactive=True,
295
- visible=if_trainee_model.value is True)
 
 
 
 
 
 
296
  trainee_model = gr.Textbox(
297
  label="Trainee Model",
298
  value="Qwen/Qwen2.5-7B-Instruct",
299
  info=_("Trainee Model Info"),
300
  interactive=True,
301
- visible=if_trainee_model.value is True)
 
302
  trainee_api_key = gr.Textbox(
303
- label=_("SiliconCloud Token for Trainee Model"),
304
- type="password",
305
- value="",
306
- info="https://cloud.siliconflow.cn/account/ak",
307
- visible=if_trainee_model.value is True)
308
-
309
 
310
  with gr.Accordion(label=_("Generation Config"), open=False):
311
- chunk_size = gr.Slider(label="Chunk Size",
312
- minimum=256,
313
- maximum=4096,
314
- value=512,
315
- step=256,
316
- interactive=True)
317
- tokenizer = gr.Textbox(label="Tokenizer",
318
- value="cl100k_base",
319
- interactive=True)
320
- qa_form = gr.Radio(choices=["atomic", "multi_hop", "aggregated"],
321
- label="QA Form",
322
- value="aggregated",
323
- interactive=True)
324
- quiz_samples = gr.Number(label="Quiz Samples",
325
- value=2,
326
- minimum=1,
327
- interactive=True,
328
- visible=if_trainee_model.value is True)
329
- bidirectional = gr.Checkbox(label="Bidirectional",
330
- value=True,
331
- interactive=True)
332
-
333
- expand_method = gr.Radio(choices=["max_width", "max_tokens"],
334
- label="Expand Method",
335
- value="max_tokens",
336
- interactive=True)
 
 
 
 
 
 
 
 
337
  max_extra_edges = gr.Slider(
338
  minimum=1,
339
  maximum=10,
@@ -341,44 +356,54 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
341
  label="Max Extra Edges",
342
  step=1,
343
  interactive=True,
344
- visible=expand_method.value == "max_width")
345
- max_tokens = gr.Slider(minimum=64,
346
- maximum=1024,
347
- value=256,
348
- label="Max Tokens",
349
- step=64,
350
- interactive=True,
351
- visible=(expand_method.value
352
- != "max_width"))
353
-
354
- max_depth = gr.Slider(minimum=1,
355
- maximum=5,
356
- value=2,
357
- label="Max Depth",
358
- step=1,
359
- interactive=True)
 
 
 
 
360
  edge_sampling = gr.Radio(
361
  choices=["max_loss", "min_loss", "random"],
362
  label="Edge Sampling",
363
  value="max_loss",
364
  interactive=True,
365
- visible=if_trainee_model.value is True)
366
- isolated_node_strategy = gr.Radio(choices=["add", "ignore"],
367
- label="Isolated Node Strategy",
368
- value="ignore",
369
- interactive=True)
370
- loss_strategy = gr.Radio(choices=["only_edge", "both"],
371
- label="Loss Strategy",
372
- value="only_edge",
373
- interactive=True)
 
 
 
 
 
374
 
375
  with gr.Row(equal_height=True):
376
  with gr.Column(scale=3):
377
  api_key = gr.Textbox(
378
- label=_("SiliconCloud Token"),
379
  type="password",
380
  value="",
381
- info="https://cloud.siliconflow.cn/account/ak")
 
382
  with gr.Column(scale=1):
383
  test_connection_btn = gr.Button(_("Test Connection"))
384
 
@@ -392,7 +417,8 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
392
  value=1000,
393
  step=100,
394
  interactive=True,
395
- visible=True)
 
396
  with gr.Column():
397
  tpm = gr.Slider(
398
  label="TPM",
@@ -401,8 +427,8 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
401
  value=50000,
402
  step=1000,
403
  interactive=True,
404
- visible=True)
405
-
406
 
407
  with gr.Blocks():
408
  with gr.Row(equal_height=True):
@@ -413,15 +439,17 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
413
  file_types=[".txt", ".json", ".jsonl"],
414
  interactive=True,
415
  )
416
- examples_dir = os.path.join(root_dir, 'webui', 'examples')
417
- gr.Examples(examples=[
418
- [os.path.join(examples_dir, "txt_demo.txt")],
419
- [os.path.join(examples_dir, "raw_demo.jsonl")],
420
- [os.path.join(examples_dir, "chunked_demo.json")],
421
- ],
422
- inputs=upload_file,
423
- label=_("Example Files"),
424
- examples_per_page=3)
 
 
425
  with gr.Column(scale=1):
426
  output = gr.File(
427
  label="Output(See Github FAQ)",
@@ -430,12 +458,18 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
430
  )
431
 
432
  with gr.Blocks():
433
- token_counter = gr.DataFrame(label='Token Stats',
434
- headers=["Source Text Token Count", "Estimated Token Usage", "Token Used"],
435
- datatype="str",
436
- interactive=False,
437
- visible=False,
438
- wrap=True)
 
 
 
 
 
 
439
 
440
  submit_btn = gr.Button(_("Run GraphGen"))
441
 
@@ -443,23 +477,36 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
443
  test_connection_btn.click(
444
  test_api_connection,
445
  inputs=[synthesizer_url, api_key, synthesizer_model],
446
- outputs=[])
 
447
 
448
  if if_trainee_model.value:
449
- test_connection_btn.click(test_api_connection,
450
- inputs=[trainee_url, api_key, trainee_model],
451
- outputs=[])
 
 
452
 
453
- expand_method.change(lambda method:
454
- (gr.update(visible=method == "max_width"),
455
- gr.update(visible=method != "max_width")),
456
- inputs=expand_method,
457
- outputs=[max_extra_edges, max_tokens])
 
 
 
458
 
459
  if_trainee_model.change(
460
  lambda use_trainee: [gr.update(visible=use_trainee)] * 5,
461
  inputs=if_trainee_model,
462
- outputs=[trainee_url, trainee_model, quiz_samples, edge_sampling, trainee_api_key])
 
 
 
 
 
 
 
463
 
464
  upload_file.change(
465
  lambda x: (gr.update(visible=True)),
@@ -479,41 +526,61 @@ with (gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(),
479
  )
480
 
481
  submit_btn.click(
482
- lambda *args: run_graphgen(GraphGenParams(
483
- if_trainee_model=args[0],
484
- input_file=args[1],
485
- tokenizer=args[2],
486
- qa_form=args[3],
487
- bidirectional=args[4],
488
- expand_method=args[5],
489
- max_extra_edges=args[6],
490
- max_tokens=args[7],
491
- max_depth=args[8],
492
- edge_sampling=args[9],
493
- isolated_node_strategy=args[10],
494
- loss_strategy=args[11],
495
- synthesizer_url=args[12],
496
- synthesizer_model=args[13],
497
- trainee_model=args[14],
498
- api_key=args[15],
499
- chunk_size=args[16],
500
- rpm=args[17],
501
- tpm=args[18],
502
- quiz_samples=args[19],
503
- trainee_url=args[20],
504
- trainee_api_key=args[21],
505
- token_counter=args[22],
506
- )),
 
 
507
  inputs=[
508
- if_trainee_model, upload_file, tokenizer, qa_form,
509
- bidirectional, expand_method, max_extra_edges, max_tokens,
510
- max_depth, edge_sampling, isolated_node_strategy,
511
- loss_strategy, synthesizer_url, synthesizer_model, trainee_model,
512
- api_key, chunk_size, rpm, tpm, quiz_samples, trainee_url, trainee_api_key, token_counter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
  ],
514
  outputs=[output, token_counter],
515
  )
516
 
517
  if __name__ == "__main__":
518
  demo.queue(api_open=False, default_concurrency_limit=2)
519
- demo.launch(server_name='0.0.0.0')
 
1
+ import json
2
  import os
3
  import sys
 
4
  import tempfile
5
 
 
6
  import gradio as gr
7
+ import pandas as pd
8
+ from base import GraphGenParams
9
+ from cache_utils import cleanup_workspace, setup_workspace
10
+ from count_tokens import count_tokens
11
+ from gradio_i18n import Translate
12
+ from gradio_i18n import gettext as _
13
+ from test_api import test_api_connection
14
 
15
  # pylint: disable=wrong-import-position
16
+ root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17
  sys.path.append(root_dir)
18
 
19
  from graphgen.graphgen import GraphGen
 
21
  from graphgen.models.llm.limitter import RPM, TPM
22
  from graphgen.utils import set_logger
23
 
 
24
  css = """
25
  .center-row {
26
  display: flex;
 
35
  log_file, working_dir = setup_workspace(os.path.join(root_dir, "cache"))
36
 
37
  set_logger(log_file, if_stream=False)
38
+ graph_gen = GraphGen(working_dir=working_dir)
 
 
39
 
40
  # Set up LLM clients
41
  graph_gen.synthesizer_llm_client = OpenAIModel(
 
43
  base_url=env.get("SYNTHESIZER_BASE_URL", ""),
44
  api_key=env.get("SYNTHESIZER_API_KEY", ""),
45
  request_limit=True,
46
+ rpm=RPM(env.get("RPM", 1000)),
47
+ tpm=TPM(env.get("TPM", 50000)),
48
  )
49
 
50
  graph_gen.trainee_llm_client = OpenAIModel(
 
52
  base_url=env.get("TRAINEE_BASE_URL", ""),
53
  api_key=env.get("TRAINEE_API_KEY", ""),
54
  request_limit=True,
55
+ rpm=RPM(env.get("RPM", 1000)),
56
+ tpm=TPM(env.get("TPM", 50000)),
57
  )
58
 
59
+ graph_gen.tokenizer_instance = Tokenizer(config.get("tokenizer", "cl100k_base"))
 
60
 
61
  strategy_config = config.get("traverse_strategy", {})
62
  graph_gen.traverse_strategy = TraverseStrategy(
63
+ qa_form=strategy_config.get("qa_form"),
64
  expand_method=strategy_config.get("expand_method"),
65
  bidirectional=strategy_config.get("bidirectional"),
66
  max_extra_edges=strategy_config.get("max_extra_edges"),
 
68
  max_depth=strategy_config.get("max_depth"),
69
  edge_sampling=strategy_config.get("edge_sampling"),
70
  isolated_node_strategy=strategy_config.get("isolated_node_strategy"),
71
+ loss_strategy=str(strategy_config.get("loss_strategy")),
72
  )
73
 
74
  return graph_gen
75
 
76
+
77
  # pylint: disable=too-many-statements
78
  def run_graphgen(params, progress=gr.Progress()):
79
  def sum_tokens(client):
 
83
  "if_trainee_model": params.if_trainee_model,
84
  "input_file": params.input_file,
85
  "tokenizer": params.tokenizer,
 
 
86
  "quiz_samples": params.quiz_samples,
87
  "traverse_strategy": {
88
+ "qa_form": params.qa_form,
89
  "bidirectional": params.bidirectional,
90
  "expand_method": params.expand_method,
91
  "max_extra_edges": params.max_extra_edges,
 
93
  "max_depth": params.max_depth,
94
  "edge_sampling": params.edge_sampling,
95
  "isolated_node_strategy": params.isolated_node_strategy,
96
+ "loss_strategy": params.loss_strategy,
97
  },
98
  "chunk_size": params.chunk_size,
99
  }
 
110
  }
111
 
112
  # Test API connection
113
+ test_api_connection(
114
+ env["SYNTHESIZER_BASE_URL"],
115
+ env["SYNTHESIZER_API_KEY"],
116
+ env["SYNTHESIZER_MODEL"],
117
+ )
118
+ if config["if_trainee_model"]:
119
+ test_api_connection(
120
+ env["TRAINEE_BASE_URL"], env["TRAINEE_API_KEY"], env["TRAINEE_MODEL"]
121
+ )
122
 
123
  # Initialize GraphGen
124
  graph_gen = init_graph_gen(config, env)
 
128
 
129
  try:
130
  # Load input data
131
+ file = config["input_file"]
132
  if isinstance(file, list):
133
  file = file[0]
134
 
 
136
 
137
  if file.endswith(".jsonl"):
138
  data_type = "raw"
139
+ with open(file, "r", encoding="utf-8") as f:
140
  data.extend(json.loads(line) for line in f)
141
  elif file.endswith(".json"):
142
  data_type = "chunked"
143
+ with open(file, "r", encoding="utf-8") as f:
144
  data.extend(json.load(f))
145
  elif file.endswith(".txt"):
146
  # 读取文件后根据chunk_size转成raw格式的数据
147
  data_type = "raw"
148
  content = ""
149
+ with open(file, "r", encoding="utf-8") as f:
150
  lines = f.readlines()
151
  for line in lines:
152
  content += line.strip() + " "
153
  size = int(config.get("chunk_size", 512))
154
+ chunks = [content[i : i + size] for i in range(0, len(content), size)]
 
 
155
  data.extend([{"content": chunk} for chunk in chunks])
156
  else:
157
  raise ValueError(f"Unsupported file type: {file}")
 
159
  # Process the data
160
  graph_gen.insert(data, data_type)
161
 
162
+ if config["if_trainee_model"]:
163
  # Generate quiz
164
+ graph_gen.quiz(max_samples=config["quiz_samples"])
165
 
166
  # Judge statements
167
  graph_gen.judge()
 
171
  graph_gen.judge(skip=True)
172
 
173
  # Traverse graph
174
+ graph_gen.traverse(traverse_strategy=graph_gen.traverse_strategy)
175
 
176
  # Save output
177
  output_data = graph_gen.qa_storage.data
178
  with tempfile.NamedTemporaryFile(
179
+ mode="w", suffix=".jsonl", delete=False, encoding="utf-8"
180
+ ) as tmpfile:
 
 
181
  json.dump(output_data, tmpfile, ensure_ascii=False)
182
  output_file = tmpfile.name
183
 
184
  synthesizer_tokens = sum_tokens(graph_gen.synthesizer_llm_client)
185
+ trainee_tokens = (
186
+ sum_tokens(graph_gen.trainee_llm_client)
187
+ if config["if_trainee_model"]
188
+ else 0
189
+ )
190
  total_tokens = synthesizer_tokens + trainee_tokens
191
 
192
  data_frame = params.token_counter
193
  try:
194
  _update_data = [
195
+ [data_frame.iloc[0, 0], data_frame.iloc[0, 1], str(total_tokens)]
 
 
 
 
196
  ]
197
+ new_df = pd.DataFrame(_update_data, columns=data_frame.columns)
 
 
 
198
  data_frame = new_df
199
 
200
  except Exception as e:
201
  raise gr.Error(f"DataFrame operation error: {str(e)}")
202
 
203
+ return output_file, gr.DataFrame(
204
+ label="Token Stats",
205
+ headers=["Source Text Token Count", "Expected Token Usage", "Token Used"],
206
+ datatype="str",
207
+ interactive=False,
208
+ value=data_frame,
209
+ visible=True,
210
+ wrap=True,
211
+ )
212
 
213
  except Exception as e: # pylint: disable=broad-except
214
  raise gr.Error(f"Error occurred: {str(e)}")
 
217
  # Clean up workspace
218
  cleanup_workspace(graph_gen.working_dir)
219
 
220
+
221
+ with gr.Blocks(title="GraphGen Demo", theme=gr.themes.Glass(), css=css) as demo:
222
  # Header
223
+ gr.Image(
224
+ value=os.path.join(root_dir, "resources", "images", "logo.png"),
225
+ label="GraphGen Banner",
226
+ elem_id="banner",
227
+ interactive=False,
228
+ container=False,
229
+ show_download_button=False,
230
+ show_fullscreen_button=False,
231
+ )
232
  lang_btn = gr.Radio(
233
  choices=[
234
  ("English", "en"),
 
241
  elem_classes=["center-row"],
242
  )
243
 
244
+ gr.HTML(
245
+ """
246
  <div style="display: flex; gap: 8px; margin-left: auto; align-items: center; justify-content: center;">
247
  <a href="https://github.com/open-sciencelab/GraphGen/releases">
248
  <img src="https://img.shields.io/badge/Version-v0.1.0-blue" alt="Version">
 
257
  <img src="https://img.shields.io/badge/arXiv-pdf-yellow" alt="arXiv">
258
  </a>
259
  </div>
260
+ """
261
+ )
262
  with Translate(
263
+ os.path.join(root_dir, "webui", "translation.json"),
264
+ lang_btn,
265
+ placeholder_langs=["en", "zh"],
266
+ persistant=False, # True to save the language setting in the browser. Requires gradio >= 5.6.0
 
267
  ):
268
  lang_btn.render()
269
 
270
  gr.Markdown(
271
+ value="# "
272
+ + _("Title")
273
+ + "\n\n"
274
+ + "### [GraphGen](https://github.com/open-sciencelab/GraphGen) "
275
+ + _("Intro")
276
  )
277
 
278
+ if_trainee_model = gr.Checkbox(
279
+ label=_("Use Trainee Model"), value=False, interactive=True
280
+ )
281
 
282
  with gr.Accordion(label=_("Model Config"), open=False):
283
+ synthesizer_url = gr.Textbox(
284
+ label="Synthesizer URL",
285
+ value="https://api.siliconflow.cn/v1",
286
+ info=_("Synthesizer URL Info"),
287
+ interactive=True,
288
+ )
289
+ synthesizer_model = gr.Textbox(
290
+ label="Synthesizer Model",
291
+ value="Qwen/Qwen2.5-7B-Instruct",
292
+ info=_("Synthesizer Model Info"),
293
+ interactive=True,
294
+ )
295
+ trainee_url = gr.Textbox(
296
+ label="Trainee URL",
297
+ value="https://api.siliconflow.cn/v1",
298
+ info=_("Trainee URL Info"),
299
+ interactive=True,
300
+ visible=if_trainee_model.value is True,
301
+ )
302
  trainee_model = gr.Textbox(
303
  label="Trainee Model",
304
  value="Qwen/Qwen2.5-7B-Instruct",
305
  info=_("Trainee Model Info"),
306
  interactive=True,
307
+ visible=if_trainee_model.value is True,
308
+ )
309
  trainee_api_key = gr.Textbox(
310
+ label=_("SiliconFlow Token for Trainee Model"),
311
+ type="password",
312
+ value="",
313
+ info="https://cloud.siliconflow.cn/account/ak",
314
+ visible=if_trainee_model.value is True,
315
+ )
316
 
317
  with gr.Accordion(label=_("Generation Config"), open=False):
318
+ chunk_size = gr.Slider(
319
+ label="Chunk Size",
320
+ minimum=256,
321
+ maximum=4096,
322
+ value=512,
323
+ step=256,
324
+ interactive=True,
325
+ )
326
+ tokenizer = gr.Textbox(
327
+ label="Tokenizer", value="cl100k_base", interactive=True
328
+ )
329
+ qa_form = gr.Radio(
330
+ choices=["atomic", "multi_hop", "aggregated"],
331
+ label="QA Form",
332
+ value="aggregated",
333
+ interactive=True,
334
+ )
335
+ quiz_samples = gr.Number(
336
+ label="Quiz Samples",
337
+ value=2,
338
+ minimum=1,
339
+ interactive=True,
340
+ visible=if_trainee_model.value is True,
341
+ )
342
+ bidirectional = gr.Checkbox(
343
+ label="Bidirectional", value=True, interactive=True
344
+ )
345
+
346
+ expand_method = gr.Radio(
347
+ choices=["max_width", "max_tokens"],
348
+ label="Expand Method",
349
+ value="max_tokens",
350
+ interactive=True,
351
+ )
352
  max_extra_edges = gr.Slider(
353
  minimum=1,
354
  maximum=10,
 
356
  label="Max Extra Edges",
357
  step=1,
358
  interactive=True,
359
+ visible=expand_method.value == "max_width",
360
+ )
361
+ max_tokens = gr.Slider(
362
+ minimum=64,
363
+ maximum=1024,
364
+ value=256,
365
+ label="Max Tokens",
366
+ step=64,
367
+ interactive=True,
368
+ visible=(expand_method.value != "max_width"),
369
+ )
370
+
371
+ max_depth = gr.Slider(
372
+ minimum=1,
373
+ maximum=5,
374
+ value=2,
375
+ label="Max Depth",
376
+ step=1,
377
+ interactive=True,
378
+ )
379
  edge_sampling = gr.Radio(
380
  choices=["max_loss", "min_loss", "random"],
381
  label="Edge Sampling",
382
  value="max_loss",
383
  interactive=True,
384
+ visible=if_trainee_model.value is True,
385
+ )
386
+ isolated_node_strategy = gr.Radio(
387
+ choices=["add", "ignore"],
388
+ label="Isolated Node Strategy",
389
+ value="ignore",
390
+ interactive=True,
391
+ )
392
+ loss_strategy = gr.Radio(
393
+ choices=["only_edge", "both"],
394
+ label="Loss Strategy",
395
+ value="only_edge",
396
+ interactive=True,
397
+ )
398
 
399
  with gr.Row(equal_height=True):
400
  with gr.Column(scale=3):
401
  api_key = gr.Textbox(
402
+ label=_("SiliconFlow Token"),
403
  type="password",
404
  value="",
405
+ info="https://cloud.siliconflow.cn/account/ak",
406
+ )
407
  with gr.Column(scale=1):
408
  test_connection_btn = gr.Button(_("Test Connection"))
409
 
 
417
  value=1000,
418
  step=100,
419
  interactive=True,
420
+ visible=True,
421
+ )
422
  with gr.Column():
423
  tpm = gr.Slider(
424
  label="TPM",
 
427
  value=50000,
428
  step=1000,
429
  interactive=True,
430
+ visible=True,
431
+ )
432
 
433
  with gr.Blocks():
434
  with gr.Row(equal_height=True):
 
439
  file_types=[".txt", ".json", ".jsonl"],
440
  interactive=True,
441
  )
442
+ examples_dir = os.path.join(root_dir, "webui", "examples")
443
+ gr.Examples(
444
+ examples=[
445
+ [os.path.join(examples_dir, "txt_demo.txt")],
446
+ [os.path.join(examples_dir, "raw_demo.jsonl")],
447
+ [os.path.join(examples_dir, "chunked_demo.json")],
448
+ ],
449
+ inputs=upload_file,
450
+ label=_("Example Files"),
451
+ examples_per_page=3,
452
+ )
453
  with gr.Column(scale=1):
454
  output = gr.File(
455
  label="Output(See Github FAQ)",
 
458
  )
459
 
460
  with gr.Blocks():
461
+ token_counter = gr.DataFrame(
462
+ label="Token Stats",
463
+ headers=[
464
+ "Source Text Token Count",
465
+ "Estimated Token Usage",
466
+ "Token Used",
467
+ ],
468
+ datatype="str",
469
+ interactive=False,
470
+ visible=False,
471
+ wrap=True,
472
+ )
473
 
474
  submit_btn = gr.Button(_("Run GraphGen"))
475
 
 
477
  test_connection_btn.click(
478
  test_api_connection,
479
  inputs=[synthesizer_url, api_key, synthesizer_model],
480
+ outputs=[],
481
+ )
482
 
483
  if if_trainee_model.value:
484
+ test_connection_btn.click(
485
+ test_api_connection,
486
+ inputs=[trainee_url, api_key, trainee_model],
487
+ outputs=[],
488
+ )
489
 
490
+ expand_method.change(
491
+ lambda method: (
492
+ gr.update(visible=method == "max_width"),
493
+ gr.update(visible=method != "max_width"),
494
+ ),
495
+ inputs=expand_method,
496
+ outputs=[max_extra_edges, max_tokens],
497
+ )
498
 
499
  if_trainee_model.change(
500
  lambda use_trainee: [gr.update(visible=use_trainee)] * 5,
501
  inputs=if_trainee_model,
502
+ outputs=[
503
+ trainee_url,
504
+ trainee_model,
505
+ quiz_samples,
506
+ edge_sampling,
507
+ trainee_api_key,
508
+ ],
509
+ )
510
 
511
  upload_file.change(
512
  lambda x: (gr.update(visible=True)),
 
526
  )
527
 
528
  submit_btn.click(
529
+ lambda *args: run_graphgen(
530
+ GraphGenParams(
531
+ if_trainee_model=args[0],
532
+ input_file=args[1],
533
+ tokenizer=args[2],
534
+ qa_form=args[3],
535
+ bidirectional=args[4],
536
+ expand_method=args[5],
537
+ max_extra_edges=args[6],
538
+ max_tokens=args[7],
539
+ max_depth=args[8],
540
+ edge_sampling=args[9],
541
+ isolated_node_strategy=args[10],
542
+ loss_strategy=args[11],
543
+ synthesizer_url=args[12],
544
+ synthesizer_model=args[13],
545
+ trainee_model=args[14],
546
+ api_key=args[15],
547
+ chunk_size=args[16],
548
+ rpm=args[17],
549
+ tpm=args[18],
550
+ quiz_samples=args[19],
551
+ trainee_url=args[20],
552
+ trainee_api_key=args[21],
553
+ token_counter=args[22],
554
+ )
555
+ ),
556
  inputs=[
557
+ if_trainee_model,
558
+ upload_file,
559
+ tokenizer,
560
+ qa_form,
561
+ bidirectional,
562
+ expand_method,
563
+ max_extra_edges,
564
+ max_tokens,
565
+ max_depth,
566
+ edge_sampling,
567
+ isolated_node_strategy,
568
+ loss_strategy,
569
+ synthesizer_url,
570
+ synthesizer_model,
571
+ trainee_model,
572
+ api_key,
573
+ chunk_size,
574
+ rpm,
575
+ tpm,
576
+ quiz_samples,
577
+ trainee_url,
578
+ trainee_api_key,
579
+ token_counter,
580
  ],
581
  outputs=[output, token_counter],
582
  )
583
 
584
  if __name__ == "__main__":
585
  demo.queue(api_open=False, default_concurrency_limit=2)
586
+ demo.launch(server_name="0.0.0.0")
graphgen/configs/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Configs for GraphGen
graphgen/configs/aggregated_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: aggregated # atomic, aggregated, multi_hop, cot
4
+ output_data_format: ChatML # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_tokens
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 5 # maximum depth for graph traversal
19
+ max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
graphgen/configs/atomic_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: atomic # atomic, aggregated, multi_hop, cot
4
+ output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_tokens
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 3 # maximum depth for graph traversal
19
+ max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
graphgen/configs/config.yaml.example DELETED
@@ -1,16 +0,0 @@
1
- data_type: raw
2
- input_file: resources/examples/raw_demo.jsonl
3
- tokenizer: cl100k_base
4
- quiz_samples: 2
5
- traverse_strategy:
6
- qa_form: atomic
7
- bidirectional: true
8
- edge_sampling: max_loss
9
- expand_method: max_tokens
10
- isolated_node_strategy: add
11
- max_depth: 2
12
- max_extra_edges: 5
13
- max_tokens: 256
14
- loss_strategy: only_edge
15
- web_search: false
16
- re_judge: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/configs/cot_config.yaml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: cot # atomic, aggregated, multi_hop, cot
4
+ output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ method_params:
10
+ method: leiden
11
+ max_size: 20 # Maximum size of communities
12
+ use_lcc: false
13
+ random_seed: 42
graphgen/configs/graphgen_config.yaml DELETED
@@ -1,16 +0,0 @@
1
- data_type: raw
2
- input_file: resources/examples/raw_demo.jsonl
3
- tokenizer: cl100k_base
4
- quiz_samples: 2
5
- traverse_strategy:
6
- qa_form: aggregated
7
- bidirectional: true
8
- edge_sampling: max_loss
9
- expand_method: max_width
10
- isolated_node_strategy: ignore
11
- max_depth: 1
12
- max_extra_edges: 2
13
- max_tokens: 256
14
- loss_strategy: only_edge
15
- web_search: false
16
- re_judge: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/configs/multi_hop_config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ input_data_type: raw # raw, chunked
2
+ input_file: resources/input_examples/raw_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
3
+ output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
4
+ output_data_format: ChatML # Alpaca, Sharegpt, ChatML
5
+ tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
6
+ search: # web search configuration
7
+ enabled: false # whether to enable web search
8
+ search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9
+ quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
10
+ enabled: true
11
+ quiz_samples: 2 # number of quiz samples to generate
12
+ re_judge: false # whether to re-judge the existing quiz samples
13
+ traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
14
+ bidirectional: true # whether to traverse the graph in both directions
15
+ edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
16
+ expand_method: max_width # expand method, support: max_width, max_tokens
17
+ isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
18
+ max_depth: 1 # maximum depth for graph traversal
19
+ max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
20
+ max_tokens: 256 # restricts input length (if expand_method="max_tokens")
21
+ loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
graphgen/generate.py CHANGED
@@ -1,101 +1,103 @@
 
1
  import os
2
- import json
3
  import time
4
- import argparse
5
  from importlib.resources import files
 
6
  import yaml
7
  from dotenv import load_dotenv
8
 
9
  from .graphgen import GraphGen
10
- from .models import OpenAIModel, Tokenizer, TraverseStrategy
11
- from .utils import set_logger
12
 
13
  sys_path = os.path.abspath(os.path.dirname(__file__))
14
 
15
  load_dotenv()
16
 
 
17
  def set_working_dir(folder):
18
  os.makedirs(folder, exist_ok=True)
19
  os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
20
  os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
21
 
 
22
  def save_config(config_path, global_config):
23
  if not os.path.exists(os.path.dirname(config_path)):
24
  os.makedirs(os.path.dirname(config_path))
25
- with open(config_path, "w", encoding='utf-8') as config_file:
26
- yaml.dump(global_config, config_file, default_flow_style=False, allow_unicode=True)
 
 
 
27
 
28
  def main():
29
  parser = argparse.ArgumentParser()
30
- parser.add_argument('--config_file',
31
- help='Config parameters for GraphGen.',
32
- # default=os.path.join(sys_path, "configs", "graphgen_config.yaml"),
33
- default=files('graphgen').joinpath("configs", "graphgen_config.yaml"),
34
- type=str)
35
- parser.add_argument('--output_dir',
36
- help='Output directory for GraphGen.',
37
- default=sys_path,
38
- required=True,
39
- type=str)
 
 
 
40
 
41
  args = parser.parse_args()
42
 
43
  working_dir = args.output_dir
44
  set_working_dir(working_dir)
45
- unique_id = int(time.time())
46
- set_logger(os.path.join(working_dir, "logs", f"graphgen_{unique_id}.log"), if_stream=False)
47
 
48
- with open(args.config_file, "r", encoding='utf-8') as f:
49
  config = yaml.load(f, Loader=yaml.FullLoader)
50
 
51
- input_file = config['input_file']
52
-
53
- if config['data_type'] == 'raw':
54
- with open(input_file, "r", encoding='utf-8') as f:
55
- data = [json.loads(line) for line in f]
56
- elif config['data_type'] == 'chunked':
57
- with open(input_file, "r", encoding='utf-8') as f:
58
- data = json.load(f)
59
- else:
60
- raise ValueError(f"Invalid data type: {config['data_type']}")
61
-
62
- synthesizer_llm_client = OpenAIModel(
63
- model_name=os.getenv("SYNTHESIZER_MODEL"),
64
- api_key=os.getenv("SYNTHESIZER_API_KEY"),
65
- base_url=os.getenv("SYNTHESIZER_BASE_URL")
66
- )
67
- trainee_llm_client = OpenAIModel(
68
- model_name=os.getenv("TRAINEE_MODEL"),
69
- api_key=os.getenv("TRAINEE_API_KEY"),
70
- base_url=os.getenv("TRAINEE_BASE_URL")
71
- )
72
-
73
- traverse_strategy = TraverseStrategy(
74
- **config['traverse_strategy']
75
  )
76
-
77
- graph_gen = GraphGen(
78
- working_dir=working_dir,
79
- unique_id=unique_id,
80
- synthesizer_llm_client=synthesizer_llm_client,
81
- trainee_llm_client=trainee_llm_client,
82
- if_web_search=config['web_search'],
83
- tokenizer_instance=Tokenizer(
84
- model_name=config['tokenizer']
85
  ),
86
- traverse_strategy=traverse_strategy
87
  )
88
 
89
- graph_gen.insert(data, config['data_type'])
90
-
91
- graph_gen.quiz(max_samples=config['quiz_samples'])
92
-
93
- graph_gen.judge(re_judge=config["re_judge"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- graph_gen.traverse()
 
 
96
 
97
- path = os.path.join(working_dir, "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml")
98
- save_config(path, config)
99
 
100
- if __name__ == '__main__':
101
  main()
 
1
+ import argparse
2
  import os
 
3
  import time
 
4
  from importlib.resources import files
5
+
6
  import yaml
7
  from dotenv import load_dotenv
8
 
9
  from .graphgen import GraphGen
10
+ from .utils import logger, set_logger
 
11
 
12
  sys_path = os.path.abspath(os.path.dirname(__file__))
13
 
14
  load_dotenv()
15
 
16
+
17
  def set_working_dir(folder):
18
  os.makedirs(folder, exist_ok=True)
19
  os.makedirs(os.path.join(folder, "data", "graphgen"), exist_ok=True)
20
  os.makedirs(os.path.join(folder, "logs"), exist_ok=True)
21
 
22
+
23
  def save_config(config_path, global_config):
24
  if not os.path.exists(os.path.dirname(config_path)):
25
  os.makedirs(os.path.dirname(config_path))
26
+ with open(config_path, "w", encoding="utf-8") as config_file:
27
+ yaml.dump(
28
+ global_config, config_file, default_flow_style=False, allow_unicode=True
29
+ )
30
+
31
 
32
  def main():
33
  parser = argparse.ArgumentParser()
34
+ parser.add_argument(
35
+ "--config_file",
36
+ help="Config parameters for GraphGen.",
37
+ default=files("graphgen").joinpath("configs", "aggregated_config.yaml"),
38
+ type=str,
39
+ )
40
+ parser.add_argument(
41
+ "--output_dir",
42
+ help="Output directory for GraphGen.",
43
+ default=sys_path,
44
+ required=True,
45
+ type=str,
46
+ )
47
 
48
  args = parser.parse_args()
49
 
50
  working_dir = args.output_dir
51
  set_working_dir(working_dir)
 
 
52
 
53
+ with open(args.config_file, "r", encoding="utf-8") as f:
54
  config = yaml.load(f, Loader=yaml.FullLoader)
55
 
56
+ output_data_type = config["output_data_type"]
57
+ unique_id = int(time.time())
58
+ set_logger(
59
+ os.path.join(
60
+ working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
61
+ ),
62
+ if_stream=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  )
64
+ logger.info(
65
+ "GraphGen with unique ID %s logging to %s",
66
+ unique_id,
67
+ os.path.join(
68
+ working_dir, "logs", f"graphgen_{output_data_type}_{unique_id}.log"
 
 
 
 
69
  ),
 
70
  )
71
 
72
+ graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
73
+
74
+ graph_gen.insert()
75
+
76
+ if config["search"]["enabled"]:
77
+ graph_gen.search()
78
+
79
+ # Use pipeline according to the output data type
80
+ if output_data_type in ["atomic", "aggregated", "multi_hop"]:
81
+ if "quiz_and_judge_strategy" in config and config[
82
+ "quiz_and_judge_strategy"
83
+ ].get("enabled", False):
84
+ graph_gen.quiz()
85
+ graph_gen.judge()
86
+ else:
87
+ logger.warning(
88
+ "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
89
+ )
90
+ graph_gen.traverse_strategy.edge_sampling = "random"
91
+ graph_gen.traverse()
92
+ elif output_data_type == "cot":
93
+ graph_gen.generate_reasoning(method_params=config["method_params"])
94
+ else:
95
+ raise ValueError(f"Unsupported output data type: {output_data_type}")
96
 
97
+ output_path = os.path.join(working_dir, "data", "graphgen", str(unique_id))
98
+ save_config(os.path.join(output_path, f"config-{unique_id}.yaml"), config)
99
+ logger.info("GraphGen completed successfully. Data saved to %s", output_path)
100
 
 
 
101
 
102
+ if __name__ == "__main__":
103
  main()
graphgen/graphgen.py CHANGED
@@ -1,10 +1,8 @@
1
- # Adapt from https://github.com/HKUDS/LightRAG
2
-
3
  import asyncio
4
  import os
5
  import time
6
  from dataclasses import dataclass, field
7
- from typing import List, Union, cast
8
 
9
  import gradio as gr
10
  from tqdm.asyncio import tqdm as tqdm_async
@@ -12,85 +10,124 @@ from tqdm.asyncio import tqdm as tqdm_async
12
  from .models import (
13
  Chunk,
14
  JsonKVStorage,
 
15
  NetworkXStorage,
16
  OpenAIModel,
17
  Tokenizer,
18
  TraverseStrategy,
19
- WikiSearch,
20
  )
21
  from .models.storage.base_storage import StorageNameSpace
22
  from .operators import (
23
  extract_kg,
 
24
  judge_statement,
25
  quiz,
26
- search_wikipedia,
27
- skip_judge_statement,
28
  traverse_graph_atomically,
29
  traverse_graph_by_edge,
30
  traverse_graph_for_multi_hop,
31
  )
32
- from .utils import compute_content_hash, create_event_loop, logger
 
 
 
 
 
 
33
 
34
  sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
35
 
 
36
  @dataclass
37
  class GraphGen:
38
  unique_id: int = int(time.time())
39
  working_dir: str = os.path.join(sys_path, "cache")
40
-
41
- # text chunking
42
- chunk_size: int = 1024
43
- chunk_overlap_size: int = 100
44
 
45
  # llm
 
46
  synthesizer_llm_client: OpenAIModel = None
47
  trainee_llm_client: OpenAIModel = None
48
- tokenizer_instance: Tokenizer = None
49
 
50
- # web search
51
- if_web_search: bool = False
52
- wiki_client: WikiSearch = field(default_factory=WikiSearch)
 
 
 
 
 
 
53
 
54
- # traverse strategy
55
- traverse_strategy: TraverseStrategy = field(default_factory=TraverseStrategy)
56
 
57
  # webui
58
  progress_bar: gr.Progress = None
59
 
60
  def __post_init__(self):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
62
  self.working_dir, namespace="full_docs"
63
  )
64
  self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
65
  self.working_dir, namespace="text_chunks"
66
  )
67
- self.wiki_storage: JsonKVStorage = JsonKVStorage(
68
- self.working_dir, namespace="wiki"
69
- )
70
  self.graph_storage: NetworkXStorage = NetworkXStorage(
71
  self.working_dir, namespace="graph"
72
  )
 
 
 
73
  self.rephrase_storage: JsonKVStorage = JsonKVStorage(
74
  self.working_dir, namespace="rephrase"
75
  )
76
- self.qa_storage: JsonKVStorage = JsonKVStorage(
77
- os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)), namespace=f"qa-{self.unique_id}"
 
78
  )
79
 
80
- async def async_split_chunks(self, data: Union[List[list], List[dict]], data_type: str) -> dict:
81
- # TODO: 是否进行指代消解
 
 
82
  if len(data) == 0:
83
  return {}
84
 
85
- new_docs = {}
86
  inserting_chunks = {}
87
  if data_type == "raw":
88
  assert isinstance(data, list) and isinstance(data[0], dict)
89
  # compute hash for each document
90
  new_docs = {
91
- compute_content_hash(doc['content'], prefix="doc-"): {'content': doc['content']} for doc in data
 
 
 
92
  }
93
- _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
 
 
94
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
95
  if len(new_docs) == 0:
96
  logger.warning("All docs are already in the storage")
@@ -100,63 +137,83 @@ class GraphGen:
100
  cur_index = 1
101
  doc_number = len(new_docs)
102
  async for doc_key, doc in tqdm_async(
103
- new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
104
- ):
105
  chunks = {
106
  compute_content_hash(dp["content"], prefix="chunk-"): {
107
  **dp,
108
- 'full_doc_id': doc_key
109
- } for dp in self.tokenizer_instance.chunk_by_token_size(doc["content"],
110
- self.chunk_overlap_size, self.chunk_size)
 
 
111
  }
112
  inserting_chunks.update(chunks)
113
 
114
  if self.progress_bar is not None:
115
- self.progress_bar(
116
- cur_index / doc_number, f"Chunking {doc_key}"
117
- )
118
  cur_index += 1
119
 
120
- _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
121
- inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys}
 
 
 
 
122
  elif data_type == "chunked":
123
  assert isinstance(data, list) and isinstance(data[0], list)
124
  new_docs = {
125
- compute_content_hash("".join(chunk['content']), prefix="doc-"): {'content': "".join(chunk['content'])}
126
- for doc in data for chunk in doc
 
 
 
127
  }
128
- _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
 
 
129
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
130
  if len(new_docs) == 0:
131
  logger.warning("All docs are already in the storage")
132
  return {}
133
  logger.info("[New Docs] inserting %d docs", len(new_docs))
134
- async for doc in tqdm_async(data, desc="[1/4]Chunking documents", unit="doc"):
135
- doc_str = "".join([chunk['content'] for chunk in doc])
 
 
136
  for chunk in doc:
137
- chunk_key = compute_content_hash(chunk['content'], prefix="chunk-")
138
  inserting_chunks[chunk_key] = {
139
  **chunk,
140
- 'full_doc_id': compute_content_hash(doc_str, prefix="doc-")
141
  }
142
- _add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
143
- inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys}
 
 
 
 
 
 
144
 
145
  await self.full_docs_storage.upsert(new_docs)
146
  await self.text_chunks_storage.upsert(inserting_chunks)
147
 
148
  return inserting_chunks
149
 
150
- def insert(self, data: Union[List[list], List[dict]], data_type: str):
151
  loop = create_event_loop()
152
- loop.run_until_complete(self.async_insert(data, data_type))
153
 
154
- async def async_insert(self, data: Union[List[list], List[dict]], data_type: str):
155
  """
156
-
157
  insert chunks into the graph
158
  """
159
 
 
 
 
 
160
  inserting_chunks = await self.async_split_chunks(data, data_type)
161
 
162
  if len(inserting_chunks) == 0:
@@ -169,52 +226,96 @@ class GraphGen:
169
  llm_client=self.synthesizer_llm_client,
170
  kg_instance=self.graph_storage,
171
  tokenizer_instance=self.tokenizer_instance,
172
- chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()],
173
- progress_bar = self.progress_bar,
 
 
174
  )
175
  if not _add_entities_and_relations:
176
  logger.warning("No entities or relations extracted")
177
  return
178
 
179
- logger.info("[Wiki Search] is %s", 'enabled' if self.if_web_search else 'disabled')
180
- if self.if_web_search:
181
- logger.info("[Wiki Search]...")
182
- _add_wiki_data = await search_wikipedia(
183
- llm_client= self.synthesizer_llm_client,
184
- wiki_search_client=self.wiki_client,
185
- knowledge_graph_instance=_add_entities_and_relations
186
- )
187
- await self.wiki_storage.upsert(_add_wiki_data)
188
-
189
  await self._insert_done()
190
 
191
  async def _insert_done(self):
192
  tasks = []
193
- for storage_instance in [self.full_docs_storage, self.text_chunks_storage,
194
- self.graph_storage, self.wiki_storage]:
 
 
 
 
195
  if storage_instance is None:
196
  continue
197
  tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
198
  await asyncio.gather(*tasks)
199
 
200
- def quiz(self, max_samples=1):
201
  loop = create_event_loop()
202
- loop.run_until_complete(self.async_quiz(max_samples))
203
 
204
- async def async_quiz(self, max_samples=1):
205
- await quiz(self.synthesizer_llm_client, self.graph_storage, self.rephrase_storage, max_samples)
206
- await self.rephrase_storage.index_done_callback()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- def judge(self, re_judge=False, skip=False):
209
  loop = create_event_loop()
210
- loop.run_until_complete(self.async_judge(re_judge, skip))
 
 
 
 
 
 
 
 
 
 
211
 
212
- async def async_judge(self, re_judge=False, skip=False):
213
- if skip:
214
- _update_relations = await skip_judge_statement(self.graph_storage)
215
- else:
216
- _update_relations = await judge_statement(self.trainee_llm_client, self.graph_storage,
217
- self.rephrase_storage, re_judge)
 
 
 
 
 
 
218
  await _update_relations.index_done_callback()
219
 
220
  def traverse(self):
@@ -222,26 +323,60 @@ class GraphGen:
222
  loop.run_until_complete(self.async_traverse())
223
 
224
  async def async_traverse(self):
225
- if self.traverse_strategy.qa_form == "atomic":
226
- results = await traverse_graph_atomically(self.synthesizer_llm_client,
227
- self.tokenizer_instance,
228
- self.graph_storage,
229
- self.traverse_strategy,
230
- self.text_chunks_storage,
231
- self.progress_bar)
232
- elif self.traverse_strategy.qa_form == "multi_hop":
233
- results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client,
234
- self.tokenizer_instance,
235
- self.graph_storage,
236
- self.traverse_strategy,
237
- self.text_chunks_storage,
238
- self.progress_bar)
239
- elif self.traverse_strategy.qa_form == "aggregated":
240
- results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance,
241
- self.graph_storage, self.traverse_strategy, self.text_chunks_storage,
242
- self.progress_bar)
 
 
 
 
 
 
 
 
 
 
 
243
  else:
244
- raise ValueError(f"Unknown qa_form: {self.traverse_strategy.qa_form}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  await self.qa_storage.upsert(results)
246
  await self.qa_storage.index_done_callback()
247
 
@@ -252,7 +387,7 @@ class GraphGen:
252
  async def async_clear(self):
253
  await self.full_docs_storage.drop()
254
  await self.text_chunks_storage.drop()
255
- await self.wiki_storage.drop()
256
  await self.graph_storage.clear()
257
  await self.rephrase_storage.drop()
258
  await self.qa_storage.drop()
 
 
 
1
  import asyncio
2
  import os
3
  import time
4
  from dataclasses import dataclass, field
5
+ from typing import Dict, List, Union, cast
6
 
7
  import gradio as gr
8
  from tqdm.asyncio import tqdm as tqdm_async
 
10
  from .models import (
11
  Chunk,
12
  JsonKVStorage,
13
+ JsonListStorage,
14
  NetworkXStorage,
15
  OpenAIModel,
16
  Tokenizer,
17
  TraverseStrategy,
 
18
  )
19
  from .models.storage.base_storage import StorageNameSpace
20
  from .operators import (
21
  extract_kg,
22
+ generate_cot,
23
  judge_statement,
24
  quiz,
25
+ search_all,
 
26
  traverse_graph_atomically,
27
  traverse_graph_by_edge,
28
  traverse_graph_for_multi_hop,
29
  )
30
+ from .utils import (
31
+ compute_content_hash,
32
+ create_event_loop,
33
+ format_generation_results,
34
+ logger,
35
+ read_file,
36
+ )
37
 
38
  sys_path = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
39
 
40
+
41
  @dataclass
42
  class GraphGen:
43
  unique_id: int = int(time.time())
44
  working_dir: str = os.path.join(sys_path, "cache")
45
+ config: Dict = field(default_factory=dict)
 
 
 
46
 
47
  # llm
48
+ tokenizer_instance: Tokenizer = None
49
  synthesizer_llm_client: OpenAIModel = None
50
  trainee_llm_client: OpenAIModel = None
 
51
 
52
+ # text chunking
53
+ # TODO: make it configurable
54
+ chunk_size: int = 1024
55
+ chunk_overlap_size: int = 100
56
+
57
+ # search
58
+ search_config: dict = field(
59
+ default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
60
+ )
61
 
62
+ # traversal
63
+ traverse_strategy: TraverseStrategy = None
64
 
65
  # webui
66
  progress_bar: gr.Progress = None
67
 
68
  def __post_init__(self):
69
+ self.tokenizer_instance: Tokenizer = Tokenizer(
70
+ model_name=self.config["tokenizer"]
71
+ )
72
+ self.synthesizer_llm_client: OpenAIModel = OpenAIModel(
73
+ model_name=os.getenv("SYNTHESIZER_MODEL"),
74
+ api_key=os.getenv("SYNTHESIZER_API_KEY"),
75
+ base_url=os.getenv("SYNTHESIZER_BASE_URL"),
76
+ tokenizer_instance=self.tokenizer_instance,
77
+ )
78
+ self.trainee_llm_client: OpenAIModel = OpenAIModel(
79
+ model_name=os.getenv("TRAINEE_MODEL"),
80
+ api_key=os.getenv("TRAINEE_API_KEY"),
81
+ base_url=os.getenv("TRAINEE_BASE_URL"),
82
+ tokenizer_instance=self.tokenizer_instance,
83
+ )
84
+ self.search_config = self.config["search"]
85
+
86
+ if "traverse_strategy" in self.config:
87
+ self.traverse_strategy = TraverseStrategy(
88
+ **self.config["traverse_strategy"]
89
+ )
90
+
91
  self.full_docs_storage: JsonKVStorage = JsonKVStorage(
92
  self.working_dir, namespace="full_docs"
93
  )
94
  self.text_chunks_storage: JsonKVStorage = JsonKVStorage(
95
  self.working_dir, namespace="text_chunks"
96
  )
 
 
 
97
  self.graph_storage: NetworkXStorage = NetworkXStorage(
98
  self.working_dir, namespace="graph"
99
  )
100
+ self.search_storage: JsonKVStorage = JsonKVStorage(
101
+ self.working_dir, namespace="search"
102
+ )
103
  self.rephrase_storage: JsonKVStorage = JsonKVStorage(
104
  self.working_dir, namespace="rephrase"
105
  )
106
+ self.qa_storage: JsonListStorage = JsonListStorage(
107
+ os.path.join(self.working_dir, "data", "graphgen", str(self.unique_id)),
108
+ namespace=f"qa-{self.unique_id}",
109
  )
110
 
111
+ async def async_split_chunks(
112
+ self, data: List[Union[List, Dict]], data_type: str
113
+ ) -> dict:
114
+ # TODO: configurable whether to use coreference resolution
115
  if len(data) == 0:
116
  return {}
117
 
 
118
  inserting_chunks = {}
119
  if data_type == "raw":
120
  assert isinstance(data, list) and isinstance(data[0], dict)
121
  # compute hash for each document
122
  new_docs = {
123
+ compute_content_hash(doc["content"], prefix="doc-"): {
124
+ "content": doc["content"]
125
+ }
126
+ for doc in data
127
  }
128
+ _add_doc_keys = await self.full_docs_storage.filter_keys(
129
+ list(new_docs.keys())
130
+ )
131
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
132
  if len(new_docs) == 0:
133
  logger.warning("All docs are already in the storage")
 
137
  cur_index = 1
138
  doc_number = len(new_docs)
139
  async for doc_key, doc in tqdm_async(
140
+ new_docs.items(), desc="[1/4]Chunking documents", unit="doc"
141
+ ):
142
  chunks = {
143
  compute_content_hash(dp["content"], prefix="chunk-"): {
144
  **dp,
145
+ "full_doc_id": doc_key,
146
+ }
147
+ for dp in self.tokenizer_instance.chunk_by_token_size(
148
+ doc["content"], self.chunk_overlap_size, self.chunk_size
149
+ )
150
  }
151
  inserting_chunks.update(chunks)
152
 
153
  if self.progress_bar is not None:
154
+ self.progress_bar(cur_index / doc_number, f"Chunking {doc_key}")
 
 
155
  cur_index += 1
156
 
157
+ _add_chunk_keys = await self.text_chunks_storage.filter_keys(
158
+ list(inserting_chunks.keys())
159
+ )
160
+ inserting_chunks = {
161
+ k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
162
+ }
163
  elif data_type == "chunked":
164
  assert isinstance(data, list) and isinstance(data[0], list)
165
  new_docs = {
166
+ compute_content_hash("".join(chunk["content"]), prefix="doc-"): {
167
+ "content": "".join(chunk["content"])
168
+ }
169
+ for doc in data
170
+ for chunk in doc
171
  }
172
+ _add_doc_keys = await self.full_docs_storage.filter_keys(
173
+ list(new_docs.keys())
174
+ )
175
  new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
176
  if len(new_docs) == 0:
177
  logger.warning("All docs are already in the storage")
178
  return {}
179
  logger.info("[New Docs] inserting %d docs", len(new_docs))
180
+ async for doc in tqdm_async(
181
+ data, desc="[1/4]Chunking documents", unit="doc"
182
+ ):
183
+ doc_str = "".join([chunk["content"] for chunk in doc])
184
  for chunk in doc:
185
+ chunk_key = compute_content_hash(chunk["content"], prefix="chunk-")
186
  inserting_chunks[chunk_key] = {
187
  **chunk,
188
+ "full_doc_id": compute_content_hash(doc_str, prefix="doc-"),
189
  }
190
+ _add_chunk_keys = await self.text_chunks_storage.filter_keys(
191
+ list(inserting_chunks.keys())
192
+ )
193
+ inserting_chunks = {
194
+ k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys
195
+ }
196
+ else:
197
+ raise ValueError(f"Unknown data type: {data_type}")
198
 
199
  await self.full_docs_storage.upsert(new_docs)
200
  await self.text_chunks_storage.upsert(inserting_chunks)
201
 
202
  return inserting_chunks
203
 
204
+ def insert(self):
205
  loop = create_event_loop()
206
+ loop.run_until_complete(self.async_insert())
207
 
208
+ async def async_insert(self):
209
  """
 
210
  insert chunks into the graph
211
  """
212
 
213
+ input_file = self.config["input_file"]
214
+ data_type = self.config["input_data_type"]
215
+ data = read_file(input_file)
216
+
217
  inserting_chunks = await self.async_split_chunks(data, data_type)
218
 
219
  if len(inserting_chunks) == 0:
 
226
  llm_client=self.synthesizer_llm_client,
227
  kg_instance=self.graph_storage,
228
  tokenizer_instance=self.tokenizer_instance,
229
+ chunks=[
230
+ Chunk(id=k, content=v["content"]) for k, v in inserting_chunks.items()
231
+ ],
232
+ progress_bar=self.progress_bar,
233
  )
234
  if not _add_entities_and_relations:
235
  logger.warning("No entities or relations extracted")
236
  return
237
 
 
 
 
 
 
 
 
 
 
 
238
  await self._insert_done()
239
 
240
  async def _insert_done(self):
241
  tasks = []
242
+ for storage_instance in [
243
+ self.full_docs_storage,
244
+ self.text_chunks_storage,
245
+ self.graph_storage,
246
+ self.search_storage,
247
+ ]:
248
  if storage_instance is None:
249
  continue
250
  tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
251
  await asyncio.gather(*tasks)
252
 
253
+ def search(self):
254
  loop = create_event_loop()
255
+ loop.run_until_complete(self.async_search())
256
 
257
+ async def async_search(self):
258
+ logger.info(
259
+ "Search is %s", "enabled" if self.search_config["enabled"] else "disabled"
260
+ )
261
+ if self.search_config["enabled"]:
262
+ logger.info(
263
+ "[Search] %s ...", ", ".join(self.search_config["search_types"])
264
+ )
265
+ all_nodes = await self.graph_storage.get_all_nodes()
266
+ all_nodes_names = [node[0] for node in all_nodes]
267
+ new_search_entities = await self.full_docs_storage.filter_keys(
268
+ all_nodes_names
269
+ )
270
+ logger.info(
271
+ "[Search] Found %d entities to search", len(new_search_entities)
272
+ )
273
+ _add_search_data = await search_all(
274
+ search_types=self.search_config["search_types"],
275
+ search_entities=new_search_entities,
276
+ )
277
+ if _add_search_data:
278
+ await self.search_storage.upsert(_add_search_data)
279
+ logger.info("[Search] %d entities searched", len(_add_search_data))
280
+
281
+ # Format search results for inserting
282
+ search_results = []
283
+ for _, search_data in _add_search_data.items():
284
+ search_results.extend(
285
+ [
286
+ {"content": search_data[key]}
287
+ for key in list(search_data.keys())
288
+ ]
289
+ )
290
+ # TODO: fix insert after search
291
+ await self.async_insert()
292
 
293
+ def quiz(self):
294
  loop = create_event_loop()
295
+ loop.run_until_complete(self.async_quiz())
296
+
297
+ async def async_quiz(self):
298
+ max_samples = self.config["quiz_and_judge_strategy"]["quiz_samples"]
299
+ await quiz(
300
+ self.synthesizer_llm_client,
301
+ self.graph_storage,
302
+ self.rephrase_storage,
303
+ max_samples,
304
+ )
305
+ await self.rephrase_storage.index_done_callback()
306
 
307
+ def judge(self):
308
+ loop = create_event_loop()
309
+ loop.run_until_complete(self.async_judge())
310
+
311
+ async def async_judge(self):
312
+ re_judge = self.config["quiz_and_judge_strategy"]["re_judge"]
313
+ _update_relations = await judge_statement(
314
+ self.trainee_llm_client,
315
+ self.graph_storage,
316
+ self.rephrase_storage,
317
+ re_judge,
318
+ )
319
  await _update_relations.index_done_callback()
320
 
321
  def traverse(self):
 
323
  loop.run_until_complete(self.async_traverse())
324
 
325
  async def async_traverse(self):
326
+ output_data_type = self.config["output_data_type"]
327
+
328
+ if output_data_type == "atomic":
329
+ results = await traverse_graph_atomically(
330
+ self.synthesizer_llm_client,
331
+ self.tokenizer_instance,
332
+ self.graph_storage,
333
+ self.traverse_strategy,
334
+ self.text_chunks_storage,
335
+ self.progress_bar,
336
+ )
337
+ elif output_data_type == "multi_hop":
338
+ results = await traverse_graph_for_multi_hop(
339
+ self.synthesizer_llm_client,
340
+ self.tokenizer_instance,
341
+ self.graph_storage,
342
+ self.traverse_strategy,
343
+ self.text_chunks_storage,
344
+ self.progress_bar,
345
+ )
346
+ elif output_data_type == "aggregated":
347
+ results = await traverse_graph_by_edge(
348
+ self.synthesizer_llm_client,
349
+ self.tokenizer_instance,
350
+ self.graph_storage,
351
+ self.traverse_strategy,
352
+ self.text_chunks_storage,
353
+ self.progress_bar,
354
+ )
355
  else:
356
+ raise ValueError(f"Unknown qa_form: {output_data_type}")
357
+
358
+ results = format_generation_results(
359
+ results, output_data_format=self.config["output_data_format"]
360
+ )
361
+
362
+ await self.qa_storage.upsert(results)
363
+ await self.qa_storage.index_done_callback()
364
+
365
+ def generate_reasoning(self, method_params):
366
+ loop = create_event_loop()
367
+ loop.run_until_complete(self.async_generate_reasoning(method_params))
368
+
369
+ async def async_generate_reasoning(self, method_params):
370
+ results = await generate_cot(
371
+ self.graph_storage,
372
+ self.synthesizer_llm_client,
373
+ method_params=method_params,
374
+ )
375
+
376
+ results = format_generation_results(
377
+ results, output_data_format=self.config["output_data_format"]
378
+ )
379
+
380
  await self.qa_storage.upsert(results)
381
  await self.qa_storage.index_done_callback()
382
 
 
387
  async def async_clear(self):
388
  await self.full_docs_storage.drop()
389
  await self.text_chunks_storage.drop()
390
+ await self.search_storage.drop()
391
  await self.graph_storage.clear()
392
  await self.rephrase_storage.drop()
393
  await self.qa_storage.drop()
graphgen/models/__init__.py CHANGED
@@ -1,22 +1,20 @@
1
- from .text.chunk import Chunk
2
- from .text.text_pair import TextPair
3
-
4
- from .llm.topk_token_model import Token, TopkTokenModel
5
- from .llm.openai_model import OpenAIModel
6
- from .llm.tokenizer import Tokenizer
7
-
8
- from .storage.networkx_storage import NetworkXStorage
9
- from .storage.json_storage import JsonKVStorage
10
-
11
- from .search.wiki_search import WikiSearch
12
-
13
  from .evaluate.length_evaluator import LengthEvaluator
14
  from .evaluate.mtld_evaluator import MTLDEvaluator
15
  from .evaluate.reward_evaluator import RewardEvaluator
16
  from .evaluate.uni_evaluator import UniEvaluator
17
-
 
 
 
 
 
 
 
 
18
  from .strategy.travserse_strategy import TraverseStrategy
19
-
 
20
 
21
  __all__ = [
22
  # llm models
@@ -28,8 +26,12 @@ __all__ = [
28
  "Chunk",
29
  "NetworkXStorage",
30
  "JsonKVStorage",
 
31
  # search models
32
  "WikiSearch",
 
 
 
33
  # evaluate models
34
  "TextPair",
35
  "LengthEvaluator",
@@ -38,4 +40,6 @@ __all__ = [
38
  "UniEvaluator",
39
  # strategy models
40
  "TraverseStrategy",
 
 
41
  ]
 
1
+ from .community.community_detector import CommunityDetector
 
 
 
 
 
 
 
 
 
 
 
2
  from .evaluate.length_evaluator import LengthEvaluator
3
  from .evaluate.mtld_evaluator import MTLDEvaluator
4
  from .evaluate.reward_evaluator import RewardEvaluator
5
  from .evaluate.uni_evaluator import UniEvaluator
6
+ from .llm.openai_model import OpenAIModel
7
+ from .llm.tokenizer import Tokenizer
8
+ from .llm.topk_token_model import Token, TopkTokenModel
9
+ from .search.db.uniprot_search import UniProtSearch
10
+ from .search.kg.wiki_search import WikiSearch
11
+ from .search.web.bing_search import BingSearch
12
+ from .search.web.google_search import GoogleSearch
13
+ from .storage.json_storage import JsonKVStorage, JsonListStorage
14
+ from .storage.networkx_storage import NetworkXStorage
15
  from .strategy.travserse_strategy import TraverseStrategy
16
+ from .text.chunk import Chunk
17
+ from .text.text_pair import TextPair
18
 
19
  __all__ = [
20
  # llm models
 
26
  "Chunk",
27
  "NetworkXStorage",
28
  "JsonKVStorage",
29
+ "JsonListStorage",
30
  # search models
31
  "WikiSearch",
32
+ "GoogleSearch",
33
+ "BingSearch",
34
+ "UniProtSearch",
35
  # evaluate models
36
  "TextPair",
37
  "LengthEvaluator",
 
40
  "UniEvaluator",
41
  # strategy models
42
  "TraverseStrategy",
43
+ # community models
44
+ "CommunityDetector",
45
  ]
graphgen/models/community/__init__.py ADDED
File without changes
graphgen/models/community/community_detector.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List

from graphgen.models.storage.networkx_storage import NetworkXStorage


@dataclass
class CommunityDetector:
    """Detect communities in the stored graph; only Leiden is supported so far."""

    graph_storage: NetworkXStorage = None
    method: str = "leiden"
    method_params: Dict[str, Any] = None

    async def detect_communities(self) -> Dict[str, int]:
        """Return a node-name -> community-id mapping using the configured method."""
        if self.method != "leiden":
            raise ValueError(f"Unknown community detection method: {self.method}")
        return await self._leiden_communities(**(self.method_params or {}))

    async def get_graph(self):
        """Fetch the underlying networkx graph from storage."""
        return await self.graph_storage.get_graph()

    async def _leiden_communities(
        self, max_size: int = None, **kwargs
    ) -> Dict[str, int]:
        """
        Detect communities using the Leiden algorithm.
        If max_size is given, any community larger than max_size will be split
        into smaller sub-communities each having at most max_size nodes.
        """
        # Imported lazily: igraph/leidenalg are optional heavy dependencies.
        import igraph as ig
        import networkx as nx
        from leidenalg import ModularityVertexPartition, find_partition

        nx_graph = await self.get_graph()
        nx_graph.remove_nodes_from(list(nx.isolates(nx_graph)))

        ig_graph = ig.Graph.TupleList(nx_graph.edges(), directed=False)

        seed = kwargs.get("random_seed", 42)
        node2cid: Dict[str, int] = {}

        if kwargs.get("use_lcc", False):
            # Partition only the largest connected component.
            giant = ig_graph.components().giant()
            partition = find_partition(giant, ModularityVertexPartition, seed=seed)
            for cid, members in enumerate(partition):
                for vertex in members:
                    node2cid[giant.vs[vertex]["name"]] = cid
        else:
            # Partition every component; offset ids so they stay globally unique.
            base = 0
            for component in ig_graph.components():
                sub = ig_graph.induced_subgraph(component)
                partition = find_partition(sub, ModularityVertexPartition, seed=seed)
                for cid, members in enumerate(partition):
                    for vertex in members:
                        node2cid[sub.vs[vertex]["name"]] = base + cid
                base += len(partition)

        # Optionally break oversized communities into bounded chunks.
        if max_size is None or max_size <= 0:
            return node2cid
        return await self._split_communities(node2cid, max_size)

    @staticmethod
    async def _split_communities(
        communities: Dict[str, int], max_size: int
    ) -> Dict[str, int]:
        """
        Split communities larger than max_size into smaller sub-communities.
        """
        members_by_cid: Dict[int, List[str]] = defaultdict(list)
        for node, cid in communities.items():
            members_by_cid[cid].append(node)

        relabelled: Dict[str, int] = {}
        next_id = 0
        for nodes in members_by_cid.values():
            # Chunks of at most max_size; a small community yields one chunk.
            for start in range(0, len(nodes), max_size):
                for node in nodes[start : start + max_size]:
                    relabelled[node] = next_id
                next_id += 1

        return relabelled
graphgen/models/llm/openai_model.py CHANGED
@@ -1,18 +1,21 @@
1
  import math
 
2
  from dataclasses import dataclass, field
3
- from typing import List, Dict, Optional
 
4
  import openai
5
- from openai import AsyncOpenAI, RateLimitError, APIConnectionError, APITimeoutError
6
  from tenacity import (
7
  retry,
 
8
  stop_after_attempt,
9
  wait_exponential,
10
- retry_if_exception_type,
11
  )
12
 
13
- from graphgen.models.llm.topk_token_model import TopkTokenModel, Token
14
- from graphgen.models.llm.tokenizer import Tokenizer
15
  from graphgen.models.llm.limitter import RPM, TPM
 
 
 
16
 
17
  def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]:
18
  token_logprobs = response.choices[0].logprobs.content
@@ -20,13 +23,23 @@ def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]:
20
  for token_prob in token_logprobs:
21
  prob = math.exp(token_prob.logprob)
22
  candidate_tokens = [
23
- Token(t.token, math.exp(t.logprob))
24
- for t in token_prob.top_logprobs
25
  ]
26
  token = Token(token_prob.token, prob, top_candidates=candidate_tokens)
27
  tokens.append(token)
28
  return tokens
29
 
 
 
 
 
 
 
 
 
 
 
 
30
  @dataclass
31
  class OpenAIModel(TopkTokenModel):
32
  model_name: str = "gpt-4o-mini"
@@ -42,12 +55,13 @@ class OpenAIModel(TopkTokenModel):
42
  rpm: RPM = field(default_factory=lambda: RPM(rpm=1000))
43
  tpm: TPM = field(default_factory=lambda: TPM(tpm=50000))
44
 
 
45
 
46
  def __post_init__(self):
47
  assert self.api_key is not None, "Please provide api key to access openai api."
48
- if self.api_key == "":
49
- self.api_key = "none"
50
- self.client = AsyncOpenAI(api_key=self.api_key, base_url=self.base_url)
51
 
52
  def _pre_generate(self, text: str, history: List[str]) -> Dict:
53
  kwargs = {
@@ -69,16 +83,19 @@ class OpenAIModel(TopkTokenModel):
69
  assert len(history) % 2 == 0, "History should have even number of elements."
70
  messages = history + messages
71
 
72
- kwargs['messages']= messages
73
  return kwargs
74
 
75
-
76
  @retry(
77
  stop=stop_after_attempt(5),
78
  wait=wait_exponential(multiplier=1, min=4, max=10),
79
- retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)),
 
 
80
  )
81
- async def generate_topk_per_token(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
 
 
82
  kwargs = self._pre_generate(text, history)
83
  if self.topk_per_token > 0:
84
  kwargs["logprobs"] = True
@@ -87,9 +104,8 @@ class OpenAIModel(TopkTokenModel):
87
  # Limit max_tokens to 1 to avoid long completions
88
  kwargs["max_tokens"] = 1
89
 
90
- completion = await self.client.chat.completions.create( # pylint: disable=E1125
91
- model=self.model_name,
92
- **kwargs
93
  )
94
 
95
  tokens = get_top_response_tokens(completion)
@@ -99,32 +115,41 @@ class OpenAIModel(TopkTokenModel):
99
  @retry(
100
  stop=stop_after_attempt(5),
101
  wait=wait_exponential(multiplier=1, min=4, max=10),
102
- retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)),
 
 
103
  )
104
- async def generate_answer(self, text: str, history: Optional[List[str]] = None, temperature: int = 0) -> str:
 
 
105
  kwargs = self._pre_generate(text, history)
106
  kwargs["temperature"] = temperature
107
 
108
  prompt_tokens = 0
109
- for message in kwargs['messages']:
110
- prompt_tokens += len(Tokenizer().encode_string(message['content']))
111
- estimated_tokens = prompt_tokens + kwargs['max_tokens']
 
 
112
 
113
  if self.request_limit:
114
  await self.rpm.wait(silent=True)
115
  await self.tpm.wait(estimated_tokens, silent=True)
116
 
117
- completion = await self.client.chat.completions.create( # pylint: disable=E1125
118
- model=self.model_name,
119
- **kwargs
120
  )
121
  if hasattr(completion, "usage"):
122
- self.token_usage.append({
123
- "prompt_tokens": completion.usage.prompt_tokens,
124
- "completion_tokens": completion.usage.completion_tokens,
125
- "total_tokens": completion.usage.total_tokens,
126
- })
127
- return completion.choices[0].message.content
128
-
129
- async def generate_inputs_prob(self, text: str, history: Optional[List[str]] = None) -> List[Token]:
 
 
 
 
130
  raise NotImplementedError
 
1
  import math
2
+ import re
3
  from dataclasses import dataclass, field
4
+ from typing import Dict, List, Optional
5
+
6
  import openai
7
+ from openai import APIConnectionError, APITimeoutError, AsyncOpenAI, RateLimitError
8
  from tenacity import (
9
  retry,
10
+ retry_if_exception_type,
11
  stop_after_attempt,
12
  wait_exponential,
 
13
  )
14
 
 
 
15
  from graphgen.models.llm.limitter import RPM, TPM
16
+ from graphgen.models.llm.tokenizer import Tokenizer
17
+ from graphgen.models.llm.topk_token_model import Token, TopkTokenModel
18
+
19
 
20
  def get_top_response_tokens(response: openai.ChatCompletion) -> List[Token]:
21
  token_logprobs = response.choices[0].logprobs.content
 
23
  for token_prob in token_logprobs:
24
  prob = math.exp(token_prob.logprob)
25
  candidate_tokens = [
26
+ Token(t.token, math.exp(t.logprob)) for t in token_prob.top_logprobs
 
27
  ]
28
  token = Token(token_prob.token, prob, top_candidates=candidate_tokens)
29
  tokens.append(token)
30
  return tokens
31
 
32
+
33
def filter_think_tags(text: str) -> str:
    """Strip ``<think>...</think>`` spans from *text*.

    If deleting the spans would leave an empty string, the original text is
    returned (stripped) so callers never receive an empty answer.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    return cleaned if cleaned else text.strip()
41
+
42
+
43
  @dataclass
44
  class OpenAIModel(TopkTokenModel):
45
  model_name: str = "gpt-4o-mini"
 
55
  rpm: RPM = field(default_factory=lambda: RPM(rpm=1000))
56
  tpm: TPM = field(default_factory=lambda: TPM(tpm=50000))
57
 
58
+ tokenizer_instance: Tokenizer = field(default_factory=Tokenizer)
59
 
60
  def __post_init__(self):
61
  assert self.api_key is not None, "Please provide api key to access openai api."
62
+ self.client = AsyncOpenAI(
63
+ api_key=self.api_key or "dummy", base_url=self.base_url
64
+ )
65
 
66
  def _pre_generate(self, text: str, history: List[str]) -> Dict:
67
  kwargs = {
 
83
  assert len(history) % 2 == 0, "History should have even number of elements."
84
  messages = history + messages
85
 
86
+ kwargs["messages"] = messages
87
  return kwargs
88
 
 
89
  @retry(
90
  stop=stop_after_attempt(5),
91
  wait=wait_exponential(multiplier=1, min=4, max=10),
92
+ retry=retry_if_exception_type(
93
+ (RateLimitError, APIConnectionError, APITimeoutError)
94
+ ),
95
  )
96
+ async def generate_topk_per_token(
97
+ self, text: str, history: Optional[List[str]] = None
98
+ ) -> List[Token]:
99
  kwargs = self._pre_generate(text, history)
100
  if self.topk_per_token > 0:
101
  kwargs["logprobs"] = True
 
104
  # Limit max_tokens to 1 to avoid long completions
105
  kwargs["max_tokens"] = 1
106
 
107
+ completion = await self.client.chat.completions.create( # pylint: disable=E1125
108
+ model=self.model_name, **kwargs
 
109
  )
110
 
111
  tokens = get_top_response_tokens(completion)
 
115
  @retry(
116
  stop=stop_after_attempt(5),
117
  wait=wait_exponential(multiplier=1, min=4, max=10),
118
+ retry=retry_if_exception_type(
119
+ (RateLimitError, APIConnectionError, APITimeoutError)
120
+ ),
121
  )
122
+ async def generate_answer(
123
+ self, text: str, history: Optional[List[str]] = None, temperature: int = 0
124
+ ) -> str:
125
  kwargs = self._pre_generate(text, history)
126
  kwargs["temperature"] = temperature
127
 
128
  prompt_tokens = 0
129
+ for message in kwargs["messages"]:
130
+ prompt_tokens += len(
131
+ self.tokenizer_instance.encode_string(message["content"])
132
+ )
133
+ estimated_tokens = prompt_tokens + kwargs["max_tokens"]
134
 
135
  if self.request_limit:
136
  await self.rpm.wait(silent=True)
137
  await self.tpm.wait(estimated_tokens, silent=True)
138
 
139
+ completion = await self.client.chat.completions.create( # pylint: disable=E1125
140
+ model=self.model_name, **kwargs
 
141
  )
142
  if hasattr(completion, "usage"):
143
+ self.token_usage.append(
144
+ {
145
+ "prompt_tokens": completion.usage.prompt_tokens,
146
+ "completion_tokens": completion.usage.completion_tokens,
147
+ "total_tokens": completion.usage.total_tokens,
148
+ }
149
+ )
150
+ return filter_think_tags(completion.choices[0].message.content)
151
+
152
+ async def generate_inputs_prob(
153
+ self, text: str, history: Optional[List[str]] = None
154
+ ) -> List[Token]:
155
  raise NotImplementedError
graphgen/models/search/db/__init__.py ADDED
File without changes
graphgen/models/search/db/uniprot_search.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from dataclasses import dataclass

import requests
from fastapi import HTTPException

from graphgen.utils import logger

# Root of the UniProtKB REST API; individual entries live directly under it.
UNIPROT_KB_BASE = "https://rest.uniprot.org/uniprotkb"
# Search endpoint (name kept for backward compatibility with existing imports).
UNIPROT_BASE = f"{UNIPROT_KB_BASE}/search"


@dataclass
class UniProtSearch:
    """
    UniProt Search client to search with UniProt.
    1) Get the protein by accession number.
    2) Search with keywords or protein names.
    """

    def get_entry(self, accession: str) -> dict:
        """
        Get the UniProt entry by accession number (e.g., P04637).

        FIX: entries are served from ``/uniprotkb/{accession}.json``; the
        previous URL (``/uniprotkb/search/{accession}.json``) pointed at the
        search endpoint and could never resolve an entry.
        """
        url = f"{UNIPROT_KB_BASE}/{accession}.json"
        return self._safe_get(url).json()

    def search(
        self,
        query: str,
        *,
        size: int = 10,
        cursor: str = None,
        fields: list[str] = None,
    ) -> dict:
        """
        Search UniProt with a query string.
        :param query: The search query.
        :param size: The number of results to return.
        :param cursor: The cursor for pagination.
        :param fields: The fields to return in the response.
        :return: A dictionary containing the search results.
        """
        params = {
            "query": query,
            "size": size,
        }
        if cursor:
            params["cursor"] = cursor
        if fields:
            params["fields"] = ",".join(fields)
        return self._safe_get(UNIPROT_BASE, params=params).json()

    @staticmethod
    def _safe_get(url: str, params: dict = None) -> requests.Response:
        """GET *url* and raise an HTTPException on any non-2xx response."""
        r = requests.get(
            url,
            params=params,
            headers={"Accept": "application/json"},
            timeout=10,
        )
        if not r.ok:
            logger.error("Search engine error: %s", r.text)
            raise HTTPException(r.status_code, "Search engine error.")
        return r
graphgen/models/search/kg/__init__.py ADDED
File without changes
graphgen/models/search/{wiki_search.py → kg/wiki_search.py} RENAMED
@@ -1,8 +1,9 @@
1
- from typing import List, Union
2
  from dataclasses import dataclass
 
3
 
4
  import wikipedia
5
  from wikipedia import set_lang
 
6
  from graphgen.utils import detect_main_language, logger
7
 
8
 
@@ -13,9 +14,9 @@ class WikiSearch:
13
  assert language in ["en", "zh"], "Only support English and Chinese"
14
  set_lang(language)
15
 
16
- async def search(self, query: str) -> Union[List[str], None]:
17
  self.set_language(detect_main_language(query))
18
- return wikipedia.search(query)
19
 
20
  async def summary(self, query: str) -> Union[str, None]:
21
  self.set_language(detect_main_language(query))
 
 
1
  from dataclasses import dataclass
2
+ from typing import List, Union
3
 
4
  import wikipedia
5
  from wikipedia import set_lang
6
+
7
  from graphgen.utils import detect_main_language, logger
8
 
9
 
 
14
  assert language in ["en", "zh"], "Only support English and Chinese"
15
  set_lang(language)
16
 
17
+ async def search(self, query: str, num_results: int = 1) -> Union[List[str], None]:
18
  self.set_language(detect_main_language(query))
19
+ return wikipedia.search(query, results=num_results, suggestion=False)
20
 
21
  async def summary(self, query: str) -> Union[str, None]:
22
  self.set_language(detect_main_language(query))
graphgen/models/search/web/__init__.py ADDED
File without changes
graphgen/models/search/web/bing_search.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from dataclasses import dataclass

import requests
from fastapi import HTTPException

from graphgen.utils import logger

BING_SEARCH_V7_ENDPOINT = "https://api.bing.microsoft.com/v7.0/search"
BING_MKT = "en-US"


@dataclass
class BingSearch:
    """
    Bing Search client to search with Bing.
    """

    # Azure subscription key for the Bing Web Search v7 API.
    subscription_key: str

    def search(self, query: str, num_results: int = 1):
        """
        Search with Bing and return the contexts.
        :param query: The search query.
        :param num_results: The number of results to return.
        :return: A list of search results (empty on a malformed response).
        """
        headers = {"Ocp-Apim-Subscription-Key": self.subscription_key}
        query_params = {"q": query, "mkt": BING_MKT, "count": num_results}
        resp = requests.get(
            BING_SEARCH_V7_ENDPOINT,
            headers=headers,
            params=query_params,
            timeout=10,
        )
        if not resp.ok:
            logger.error("Search engine error: %s", resp.text)
            raise HTTPException(resp.status_code, "Search engine error.")
        payload = resp.json()
        try:
            return payload["webPages"]["value"][:num_results]
        except KeyError:
            logger.error("Error encountered: %s", payload)
            return []
graphgen/models/search/web/google_search.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass

import requests
from fastapi import HTTPException

from graphgen.utils import logger

GOOGLE_SEARCH_ENDPOINT = "https://customsearch.googleapis.com/customsearch/v1"


@dataclass
class GoogleSearch:
    """
    Google Custom Search client to search with Google.

    Fields (the dataclass-generated ``__init__(subscription_key, cx)``
    keeps the original constructor signature):

    :param subscription_key: Your Google API subscription key.
    :param cx: Your custom search engine ID.
    """

    # Previously this class combined @dataclass with a hand-written
    # __init__, which made the decorator a misleading no-op. Declaring
    # fields lets the dataclass generate the identical constructor and
    # matches the sibling BingSearch client.
    subscription_key: str
    cx: str

    def search(self, query: str, num_results: int = 1):
        """
        Search with Google and return the contexts.
        :param query: The search query.
        :param num_results: The number of results to return.
        :return: A list of search results.
        """
        params = {
            "key": self.subscription_key,
            "cx": self.cx,
            "q": query,
            "num": num_results,
        }
        response = requests.get(GOOGLE_SEARCH_ENDPOINT, params=params, timeout=10)
        if not response.ok:
            logger.error("Search engine error: %s", response.text)
            raise HTTPException(response.status_code, "Search engine error.")
        json_content = response.json()
        try:
            # Custom Search JSON API returns results under "items".
            contexts = json_content["items"][:num_results]
        except KeyError:
            logger.error("Error encountered: %s", json_content)
            return []
        return contexts
graphgen/models/storage/base_storage.py CHANGED
@@ -1,9 +1,11 @@
1
  from dataclasses import dataclass
2
- from typing import Union, Generic, TypeVar
 
3
  from graphgen.models.embed.embedding import EmbeddingFunc
4
 
5
  T = TypeVar("T")
6
 
 
7
  @dataclass
8
  class StorageNameSpace:
9
  working_dir: str = None
@@ -17,9 +19,25 @@ class StorageNameSpace:
17
 
18
 
19
  @dataclass
20
- class BaseKVStorage(Generic[T], StorageNameSpace):
21
- embedding_func: EmbeddingFunc = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
 
 
23
  async def all_keys(self) -> list[str]:
24
  raise NotImplementedError
25
 
@@ -41,6 +59,7 @@ class BaseKVStorage(Generic[T], StorageNameSpace):
41
  async def drop(self):
42
  raise NotImplementedError
43
 
 
44
  @dataclass
45
  class BaseGraphStorage(StorageNameSpace):
46
  embedding_func: EmbeddingFunc = None
@@ -71,7 +90,9 @@ class BaseGraphStorage(StorageNameSpace):
71
  ) -> Union[dict, None]:
72
  raise NotImplementedError
73
 
74
- async def update_edge(self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]):
 
 
75
  raise NotImplementedError
76
 
77
  async def get_all_edges(self) -> Union[list[dict], None]:
 
1
  from dataclasses import dataclass
2
+ from typing import Generic, TypeVar, Union
3
+
4
  from graphgen.models.embed.embedding import EmbeddingFunc
5
 
6
  T = TypeVar("T")
7
 
8
+
9
  @dataclass
10
  class StorageNameSpace:
11
  working_dir: str = None
 
19
 
20
 
21
@dataclass
class BaseListStorage(Generic[T], StorageNameSpace):
    """Abstract list-backed storage interface; backends override all methods."""

    async def all_items(self) -> list[T]:
        """Return every stored item."""
        raise NotImplementedError

    async def get_by_index(self, index: int) -> Union[T, None]:
        """Return the item at *index*, or None when out of range."""
        raise NotImplementedError

    async def append(self, data: T):
        """Append a single item to the list."""
        raise NotImplementedError

    async def upsert(self, data: list[T]):
        """Insert the items from *data* that are not already stored."""
        raise NotImplementedError

    async def drop(self):
        """Remove all stored items."""
        raise NotImplementedError
37
+
38
 
39
+ @dataclass
40
+ class BaseKVStorage(Generic[T], StorageNameSpace):
41
  async def all_keys(self) -> list[str]:
42
  raise NotImplementedError
43
 
 
59
  async def drop(self):
60
  raise NotImplementedError
61
 
62
+
63
  @dataclass
64
  class BaseGraphStorage(StorageNameSpace):
65
  embedding_func: EmbeddingFunc = None
 
90
  ) -> Union[dict, None]:
91
  raise NotImplementedError
92
 
93
+ async def update_edge(
94
+ self, source_node_id: str, target_node_id: str, edge_data: dict[str, str]
95
+ ):
96
  raise NotImplementedError
97
 
98
  async def get_all_edges(self) -> Union[list[dict], None]:
graphgen/models/storage/json_storage.py CHANGED
@@ -1,8 +1,8 @@
1
  import os
2
-
3
  from dataclasses import dataclass
4
- from graphgen.utils import logger, load_json, write_json
5
- from graphgen.models.storage.base_storage import BaseKVStorage
 
6
 
7
 
8
  @dataclass
@@ -49,3 +49,39 @@ class JsonKVStorage(BaseKVStorage):
49
 
50
  async def drop(self):
51
  self._data = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  from dataclasses import dataclass
3
+
4
+ from graphgen.models.storage.base_storage import BaseKVStorage, BaseListStorage
5
+ from graphgen.utils import load_json, logger, write_json
6
 
7
 
8
  @dataclass
 
49
 
50
  async def drop(self):
51
  self._data = {}
52
+
53
+
54
@dataclass
class JsonListStorage(BaseListStorage):
    """List storage persisted as a JSON array in the working directory."""

    _data: list = None

    def __post_init__(self):
        # Backing file is <working_dir>/<namespace>.json; start empty if absent.
        self._file_name = os.path.join(self.working_dir, f"{self.namespace}.json")
        self._data = load_json(self._file_name) or []
        logger.info("Load List %s with %d data", self.namespace, len(self._data))

    @property
    def data(self):
        return self._data

    async def all_items(self) -> list:
        return self._data

    async def index_done_callback(self):
        # Flush the in-memory list back to disk.
        write_json(self._data, self._file_name)

    async def get_by_index(self, index: int):
        # Out-of-range (including negative) indices yield None rather than
        # Python's wrap-around indexing.
        if 0 <= index < len(self._data):
            return self._data[index]
        return None

    async def append(self, data):
        self._data.append(data)

    async def upsert(self, data: list):
        # Keep only items not already present; return what was added.
        new_items = [item for item in data if item not in self._data]
        self._data.extend(new_items)
        return new_items

    async def drop(self):
        self._data = []
graphgen/models/vis/__init__.py ADDED
File without changes
graphgen/models/vis/community_visualizer.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass
from typing import Dict

import matplotlib.pyplot as plt
import networkx as nx


@dataclass
class Visualizer:
    """
    Class for visualizing graphs using NetworkX and Matplotlib.
    """

    graph: nx.Graph = None  # graph to draw
    communities: Dict[str, int] = None  # node -> community id, used for colors
    layout: str = "spring"  # only "spring" is currently supported
    max_nodes: int = 1000  # NOTE(review): currently unused by visualize - confirm intent
    node_size: int = 10
    alpha: float = 0.6

    def visualize(self, save_path: str = None):
        """Draw the graph with nodes colored by community.

        :param save_path: when given, write a 300-dpi image there instead of
            showing the figure interactively.
        :raises ValueError: if ``self.layout`` is not a supported layout.
        """
        n = self.graph.number_of_nodes()
        if self.layout == "spring":
            # Scale repulsion with graph size; fixed seed for reproducibility.
            k = max(0.1, 1.0 / (n**0.5))
            pos = nx.spring_layout(self.graph, k=k, seed=42)
        else:
            raise ValueError(f"Unknown layout: {self.layout}")

        fig = plt.figure(figsize=(10, 10))

        # Nodes without a community entry fall into color bucket 0.
        node_colors = [self.communities.get(node, 0) for node in self.graph.nodes()]

        nx.draw_networkx_nodes(
            self.graph,
            pos,
            node_size=self.node_size,
            node_color=node_colors,
            cmap=plt.cm.tab20,
            alpha=self.alpha,
        )
        nx.draw_networkx_edges(self.graph, pos, alpha=0.3, width=0.2)
        plt.axis("off")

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches="tight")
            print("Saved to", save_path)
            # Close the figure so repeated calls don't leak Matplotlib
            # figures (the original left every figure open).
            plt.close(fig)
        else:
            plt.show()
graphgen/operators/__init__.py CHANGED
@@ -1,16 +1,22 @@
1
- from .extract_kg import extract_kg
 
 
 
 
2
  from .quiz import quiz
3
- from .judge import judge_statement, skip_judge_statement
4
- from .search_wikipedia import search_wikipedia
5
- from .traverse_graph import traverse_graph_by_edge, traverse_graph_atomically, traverse_graph_for_multi_hop
 
 
6
 
7
  __all__ = [
8
  "extract_kg",
9
  "quiz",
10
  "judge_statement",
11
- "skip_judge_statement",
12
- "search_wikipedia",
13
  "traverse_graph_by_edge",
14
  "traverse_graph_atomically",
15
- "traverse_graph_for_multi_hop"
 
16
  ]
 
1
+ from graphgen.operators.generate.generate_cot import generate_cot
2
+ from graphgen.operators.kg.extract_kg import extract_kg
3
+ from graphgen.operators.search.search_all import search_all
4
+
5
+ from .judge import judge_statement
6
  from .quiz import quiz
7
+ from .traverse_graph import (
8
+ traverse_graph_atomically,
9
+ traverse_graph_by_edge,
10
+ traverse_graph_for_multi_hop,
11
+ )
12
 
13
  __all__ = [
14
  "extract_kg",
15
  "quiz",
16
  "judge_statement",
17
+ "search_all",
 
18
  "traverse_graph_by_edge",
19
  "traverse_graph_atomically",
20
+ "traverse_graph_for_multi_hop",
21
+ "generate_cot",
22
  ]
graphgen/operators/generate/__init__.py ADDED
File without changes
graphgen/operators/generate/generate_cot.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
from typing import Dict, List, Tuple

from tqdm.asyncio import tqdm as tqdm_async

from graphgen.models import CommunityDetector, NetworkXStorage, OpenAIModel
from graphgen.templates import COT_GENERATION_PROMPT, COT_TEMPLATE_DESIGN_PROMPT
from graphgen.utils import compute_content_hash, detect_main_language


async def generate_cot(
    graph_storage: NetworkXStorage,
    synthesizer_llm_client: OpenAIModel,
    method_params: Dict = None,
):
    """Generate chain-of-thought (CoT) QA data from graph communities.

    Detects communities in the stored knowledge graph, then for each
    community asks the synthesizer LLM to (1) design a question and a
    reasoning-path template from the community's entities/relations and
    (2) produce the final CoT answer following that template.

    :param graph_storage: graph storage holding node/edge descriptions.
    :param synthesizer_llm_client: LLM client used for both generation steps.
    :param method_params: community-detection options; ``method`` selects the
        algorithm (default ``"leiden"``) and the whole dict is forwarded to
        the detector.
    :return: dict keyed by the question's content hash; each value has
        ``question``, ``reasoning_path`` and ``answer``.
    :raises ValueError: when the LLM's template reply lacks the expected
        section markers.
    """
    method = method_params.get("method", "leiden")
    detector = CommunityDetector(
        graph_storage=graph_storage, method=method, method_params=method_params
    )

    results = await detector.detect_communities()

    # Convert results to a format suitable for summarization:
    # invert node -> community_id into community_id -> [nodes].
    communities = {}
    for node, community_id in results.items():
        if community_id not in communities:
            communities[community_id] = []
        communities[community_id].append(node)

    if not communities:
        return {}

    # Cap the number of concurrent LLM calls.
    semaphore = asyncio.Semaphore(value=1000)

    async def _generate_from_single_community(
        c_id: int, nodes: List[str]
    ) -> Tuple[int, Tuple[str, str, str]]:
        """Summarize a single community."""
        async with semaphore:
            entities: List[str] = []
            relationships: List[str] = []

            # Collect node descriptions; keep only intra-community edges.
            for n in nodes:
                node_data = await graph_storage.get_node(n)
                if node_data is not None:
                    entities.append(f"({n}: {node_data.get('description')})")

                edges = await graph_storage.get_node_edges(n)
                for edge in edges:
                    target = edge[1]
                    if target in nodes:
                        edge_data = await graph_storage.get_edge(n, target)
                        relationships.append(
                            f"({n}) - [{edge_data['description']}] -> ({target})"
                        )

            entities_str = "\n".join(entities)
            relationships_str = "\n".join(relationships)

            # Prompt language follows the dominant language of the content.
            language = (
                "English"
                if detect_main_language(entities_str + relationships_str) == "en"
                else "Chinese"
            )

            # Step 1: design a question + reasoning-path template.
            prompt = COT_TEMPLATE_DESIGN_PROMPT[language]["TEMPLATE"].format(
                entities=entities_str,
                relationships=relationships_str,
            )

            cot_template = await synthesizer_llm_client.generate_answer(prompt)

            # Parse the reply; the section markers depend on the language
            # (Chinese markers first, then English).
            if "问题:" in cot_template and "推理路径设计:" in cot_template:
                question = cot_template.split("问题:")[1].split("推理路径设计:")[0].strip()
                reasoning_path = cot_template.split("推理路径设计:")[1].strip()
            elif (
                "Question:" in cot_template and "Reasoning-Path Design:" in cot_template
            ):
                question = (
                    cot_template.split("Question:")[1]
                    .split("Reasoning-Path Design:")[0]
                    .strip()
                )
                reasoning_path = cot_template.split("Reasoning-Path Design:")[1].strip()
            else:
                raise ValueError("COT template format is incorrect.")

            # Step 2: generate the CoT answer following the designed template.
            prompt = COT_GENERATION_PROMPT[language]["TEMPLATE"].format(
                entities=entities_str,
                relationships=relationships_str,
                question=question,
                reasoning_template=reasoning_path,
            )

            cot_answer = await synthesizer_llm_client.generate_answer(prompt)

            return c_id, (question, reasoning_path, cot_answer)

    cid_nodes = list(communities.items())

    # NOTE: rebinds `results` (previously the detection mapping) as the output.
    results: Dict = {}
    async for coro in tqdm_async(
        asyncio.as_completed(
            [_generate_from_single_community(cid, nodes) for cid, nodes in cid_nodes]
        ),
        total=len(cid_nodes),
        desc="[Generating COT] Generating CoT data from communities",
        unit="community",
    ):
        cid, (q, r, a) = await coro
        # Key by content hash so identical questions collapse to one entry.
        results[compute_content_hash(q)] = {
            "question": q,
            "reasoning_path": r,
            "answer": a,
        }

    return results
graphgen/operators/judge.py CHANGED
@@ -1,17 +1,20 @@
1
- import math
2
  import asyncio
 
 
3
  from tqdm.asyncio import tqdm as tqdm_async
4
- from graphgen.models import NetworkXStorage, OpenAIModel, JsonKVStorage
5
- from graphgen.utils import logger, yes_no_loss_entropy
6
  from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
 
7
 
8
 
9
- async def judge_statement( # pylint: disable=too-many-statements
10
- trainee_llm_client: OpenAIModel,
11
- graph_storage: NetworkXStorage,
12
- rephrase_storage: JsonKVStorage,
13
- re_judge: bool = False,
14
- max_concurrent: int = 1000) -> NetworkXStorage:
 
15
  """
16
  Get all edges and nodes and judge them
17
 
@@ -34,7 +37,12 @@ async def judge_statement( # pylint: disable=too-many-statements
34
  edge_data = edge[2]
35
 
36
  if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
37
- logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"])
 
 
 
 
 
38
  return source_id, target_id, edge_data
39
 
40
  description = edge_data["description"]
@@ -47,17 +55,27 @@ async def judge_statement( # pylint: disable=too-many-statements
47
  gts = [gt for _, gt in descriptions]
48
  for description, gt in descriptions:
49
  judgement = await trainee_llm_client.generate_topk_per_token(
50
- STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
 
 
51
  )
52
  judgements.append(judgement[0].top_candidates)
53
 
54
  loss = yes_no_loss_entropy(judgements, gts)
55
 
56
- logger.info("Edge %s -> %s description: %s loss: %s", source_id, target_id, description, loss)
 
 
 
 
 
 
57
 
58
  edge_data["loss"] = loss
59
- except Exception as e: # pylint: disable=broad-except
60
- logger.error("Error in judging relation %s -> %s: %s", source_id, target_id, e)
 
 
61
  logger.info("Use default loss 0.1")
62
  edge_data["loss"] = -math.log(0.1)
63
 
@@ -68,9 +86,9 @@ async def judge_statement( # pylint: disable=too-many-statements
68
 
69
  results = []
70
  for result in tqdm_async(
71
- asyncio.as_completed([_judge_single_relation(edge) for edge in edges]),
72
- total=len(edges),
73
- desc="Judging relations"
74
  ):
75
  results.append(await result)
76
 
@@ -82,7 +100,9 @@ async def judge_statement( # pylint: disable=too-many-statements
82
  node_data = node[1]
83
 
84
  if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
85
- logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"])
 
 
86
  return node_id, node_data
87
 
88
  description = node_data["description"]
@@ -95,16 +115,20 @@ async def judge_statement( # pylint: disable=too-many-statements
95
  gts = [gt for _, gt in descriptions]
96
  for description, gt in descriptions:
97
  judgement = await trainee_llm_client.generate_topk_per_token(
98
- STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
 
 
99
  )
100
  judgements.append(judgement[0].top_candidates)
101
 
102
  loss = yes_no_loss_entropy(judgements, gts)
103
 
104
- logger.info("Node %s description: %s loss: %s", node_id, description, loss)
 
 
105
 
106
  node_data["loss"] = loss
107
- except Exception as e: # pylint: disable=broad-except
108
  logger.error("Error in judging entity %s: %s", node_id, e)
109
  logger.info("Use default loss 0.1")
110
  node_data["loss"] = -math.log(0.1)
@@ -116,72 +140,9 @@ async def judge_statement( # pylint: disable=too-many-statements
116
 
117
  results = []
118
  for result in tqdm_async(
119
- asyncio.as_completed([_judge_single_entity(node) for node in nodes]),
120
- total=len(nodes),
121
- desc="Judging entities"
122
- ):
123
- results.append(await result)
124
-
125
- return graph_storage
126
-
127
- async def skip_judge_statement(
128
- graph_storage: NetworkXStorage,
129
- max_concurrent: int = 1000
130
- ):
131
- """
132
- Skip the judgement of the statement
133
- :param graph_storage: graph storage instance
134
- :param max_concurrent: max concurrent
135
- :return:
136
- """
137
- semaphore = asyncio.Semaphore(max_concurrent)
138
-
139
- async def _skip_single_relation(
140
- edge: tuple,
141
- ):
142
- async with semaphore:
143
- source_id = edge[0]
144
- target_id = edge[1]
145
- edge_data = edge[2]
146
-
147
- if "loss" in edge_data and edge_data["loss"] is not None:
148
- logger.info("Edge %s -> %s already judged, loss: %s, skip", source_id, target_id, edge_data["loss"])
149
- return source_id, target_id, edge_data
150
-
151
- edge_data["loss"] = -math.log(0.1)
152
- await graph_storage.update_edge(source_id, target_id, edge_data)
153
- return source_id, target_id, edge_data
154
-
155
- edges = await graph_storage.get_all_edges()
156
- results = []
157
- for result in tqdm_async(
158
- asyncio.as_completed([_skip_single_relation(edge) for edge in edges]),
159
- total=len(edges),
160
- desc="Skipping judgement of relations"
161
- ):
162
- results.append(await result)
163
-
164
- async def _skip_single_entity(
165
- node: tuple,
166
- ):
167
- async with semaphore:
168
- node_id = node[0]
169
- node_data = node[1]
170
-
171
- if "loss" in node_data and node_data["loss"] is not None:
172
- logger.info("Node %s already judged, loss: %s, skip", node_id, node_data["loss"])
173
- return node_id, node_data
174
-
175
- node_data["loss"] = -math.log(0.1)
176
- await graph_storage.update_node(node_id, node_data)
177
- return node_id, node_data
178
-
179
- nodes = await graph_storage.get_all_nodes()
180
- results = []
181
- for result in tqdm_async(
182
- asyncio.as_completed([_skip_single_entity(node) for node in nodes]),
183
- total=len(nodes),
184
- desc="Skipping judgement of entities"
185
  ):
186
  results.append(await result)
187
 
 
 
1
  import asyncio
2
+ import math
3
+
4
  from tqdm.asyncio import tqdm as tqdm_async
5
+
6
+ from graphgen.models import JsonKVStorage, NetworkXStorage, OpenAIModel
7
  from graphgen.templates import STATEMENT_JUDGEMENT_PROMPT
8
+ from graphgen.utils import logger, yes_no_loss_entropy
9
 
10
 
11
+ async def judge_statement( # pylint: disable=too-many-statements
12
+ trainee_llm_client: OpenAIModel,
13
+ graph_storage: NetworkXStorage,
14
+ rephrase_storage: JsonKVStorage,
15
+ re_judge: bool = False,
16
+ max_concurrent: int = 1000,
17
+ ) -> NetworkXStorage:
18
  """
19
  Get all edges and nodes and judge them
20
 
 
37
  edge_data = edge[2]
38
 
39
  if (not re_judge) and "loss" in edge_data and edge_data["loss"] is not None:
40
+ logger.info(
41
+ "Edge %s -> %s already judged, loss: %s, skip",
42
+ source_id,
43
+ target_id,
44
+ edge_data["loss"],
45
+ )
46
  return source_id, target_id, edge_data
47
 
48
  description = edge_data["description"]
 
55
  gts = [gt for _, gt in descriptions]
56
  for description, gt in descriptions:
57
  judgement = await trainee_llm_client.generate_topk_per_token(
58
+ STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
59
+ statement=description
60
+ )
61
  )
62
  judgements.append(judgement[0].top_candidates)
63
 
64
  loss = yes_no_loss_entropy(judgements, gts)
65
 
66
+ logger.info(
67
+ "Edge %s -> %s description: %s loss: %s",
68
+ source_id,
69
+ target_id,
70
+ description,
71
+ loss,
72
+ )
73
 
74
  edge_data["loss"] = loss
75
+ except Exception as e: # pylint: disable=broad-except
76
+ logger.error(
77
+ "Error in judging relation %s -> %s: %s", source_id, target_id, e
78
+ )
79
  logger.info("Use default loss 0.1")
80
  edge_data["loss"] = -math.log(0.1)
81
 
 
86
 
87
  results = []
88
  for result in tqdm_async(
89
+ asyncio.as_completed([_judge_single_relation(edge) for edge in edges]),
90
+ total=len(edges),
91
+ desc="Judging relations",
92
  ):
93
  results.append(await result)
94
 
 
100
  node_data = node[1]
101
 
102
  if (not re_judge) and "loss" in node_data and node_data["loss"] is not None:
103
+ logger.info(
104
+ "Node %s already judged, loss: %s, skip", node_id, node_data["loss"]
105
+ )
106
  return node_id, node_data
107
 
108
  description = node_data["description"]
 
115
  gts = [gt for _, gt in descriptions]
116
  for description, gt in descriptions:
117
  judgement = await trainee_llm_client.generate_topk_per_token(
118
+ STATEMENT_JUDGEMENT_PROMPT["TEMPLATE"].format(
119
+ statement=description
120
+ )
121
  )
122
  judgements.append(judgement[0].top_candidates)
123
 
124
  loss = yes_no_loss_entropy(judgements, gts)
125
 
126
+ logger.info(
127
+ "Node %s description: %s loss: %s", node_id, description, loss
128
+ )
129
 
130
  node_data["loss"] = loss
131
+ except Exception as e: # pylint: disable=broad-except
132
  logger.error("Error in judging entity %s: %s", node_id, e)
133
  logger.info("Use default loss 0.1")
134
  node_data["loss"] = -math.log(0.1)
 
140
 
141
  results = []
142
  for result in tqdm_async(
143
+ asyncio.as_completed([_judge_single_entity(node) for node in nodes]),
144
+ total=len(nodes),
145
+ desc="Judging entities",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  ):
147
  results.append(await result)
148
 
graphgen/operators/kg/__init__.py ADDED
File without changes
graphgen/operators/{extract_kg.py → kg/extract_kg.py} RENAMED
@@ -1,27 +1,33 @@
1
- import re
2
  import asyncio
3
- from typing import List
4
  from collections import defaultdict
 
5
 
6
  import gradio as gr
7
  from tqdm.asyncio import tqdm as tqdm_async
 
8
  from graphgen.models import Chunk, OpenAIModel, Tokenizer
9
  from graphgen.models.storage.base_storage import BaseGraphStorage
 
10
  from graphgen.templates import KG_EXTRACTION_PROMPT
11
- from graphgen.utils import (logger, pack_history_conversations, split_string_by_multi_markers,
12
- handle_single_entity_extraction, handle_single_relationship_extraction,
13
- detect_if_chinese)
14
- from graphgen.operators.merge_kg import merge_nodes, merge_edges
 
 
 
 
15
 
16
 
17
  # pylint: disable=too-many-statements
18
  async def extract_kg(
19
- llm_client: OpenAIModel,
20
- kg_instance: BaseGraphStorage,
21
- tokenizer_instance: Tokenizer,
22
- chunks: List[Chunk],
23
- progress_bar: gr.Progress = None,
24
- max_concurrent: int = 1000
25
  ):
26
  """
27
  :param llm_client: Synthesizer LLM model to extract entities and relationships
@@ -50,25 +56,25 @@ async def extract_kg(
50
  )
51
 
52
  final_result = await llm_client.generate_answer(hint_prompt)
53
- logger.info('First result: %s', final_result)
54
 
55
  history = pack_history_conversations(hint_prompt, final_result)
56
  for loop_index in range(max_loop):
57
  if_loop_result = await llm_client.generate_answer(
58
- text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"],
59
- history=history
60
  )
61
  if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
62
  if if_loop_result != "yes":
63
  break
64
 
65
  glean_result = await llm_client.generate_answer(
66
- text=KG_EXTRACTION_PROMPT[language]["CONTINUE"],
67
- history=history
68
  )
69
- logger.info('Loop %s glean: %s', loop_index, glean_result)
70
 
71
- history += pack_history_conversations(KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result)
 
 
72
  final_result += glean_result
73
  if loop_index == max_loop - 1:
74
  break
@@ -76,8 +82,9 @@ async def extract_kg(
76
  records = split_string_by_multi_markers(
77
  final_result,
78
  [
79
- KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
80
- KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"]],
 
81
  )
82
 
83
  nodes = defaultdict(list)
@@ -87,16 +94,20 @@ async def extract_kg(
87
  record = re.search(r"\((.*)\)", record)
88
  if record is None:
89
  continue
90
- record = record.group(1) # 提取括号内的内容
91
  record_attributes = split_string_by_multi_markers(
92
  record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
93
  )
94
 
95
- entity = await handle_single_entity_extraction(record_attributes, chunk_id)
 
 
96
  if entity is not None:
97
  nodes[entity["entity_name"]].append(entity)
98
  continue
99
- relation = await handle_single_relationship_extraction(record_attributes, chunk_id)
 
 
100
  if relation is not None:
101
  edges[(relation["src_id"], relation["tgt_id"])].append(relation)
102
  return dict(nodes), dict(edges)
@@ -106,17 +117,25 @@ async def extract_kg(
106
  async for result in tqdm_async(
107
  asyncio.as_completed([_process_single_content(c) for c in chunks]),
108
  total=len(chunks),
109
- desc="[3/4]Extracting entities and relationships from chunks",
110
  unit="chunk",
111
  ):
112
  try:
113
  if progress_bar is not None:
114
- progress_bar(len(results) / chunk_number, desc="[3/4]Extracting entities and relationships from chunks")
 
 
 
115
  results.append(await result)
116
  if progress_bar is not None and len(results) == chunk_number:
117
- progress_bar(1, desc="[3/4]Extracting entities and relationships from chunks")
118
- except Exception as e: # pylint: disable=broad-except
119
- logger.error("Error occurred while extracting entities and relationships from chunks: %s", e)
 
 
 
 
 
120
 
121
  nodes = defaultdict(list)
122
  edges = defaultdict(list)
 
 
1
  import asyncio
2
+ import re
3
  from collections import defaultdict
4
+ from typing import List
5
 
6
  import gradio as gr
7
  from tqdm.asyncio import tqdm as tqdm_async
8
+
9
  from graphgen.models import Chunk, OpenAIModel, Tokenizer
10
  from graphgen.models.storage.base_storage import BaseGraphStorage
11
+ from graphgen.operators.kg.merge_kg import merge_edges, merge_nodes
12
  from graphgen.templates import KG_EXTRACTION_PROMPT
13
+ from graphgen.utils import (
14
+ detect_if_chinese,
15
+ handle_single_entity_extraction,
16
+ handle_single_relationship_extraction,
17
+ logger,
18
+ pack_history_conversations,
19
+ split_string_by_multi_markers,
20
+ )
21
 
22
 
23
  # pylint: disable=too-many-statements
24
  async def extract_kg(
25
+ llm_client: OpenAIModel,
26
+ kg_instance: BaseGraphStorage,
27
+ tokenizer_instance: Tokenizer,
28
+ chunks: List[Chunk],
29
+ progress_bar: gr.Progress = None,
30
+ max_concurrent: int = 1000,
31
  ):
32
  """
33
  :param llm_client: Synthesizer LLM model to extract entities and relationships
 
56
  )
57
 
58
  final_result = await llm_client.generate_answer(hint_prompt)
59
+ logger.info("First result: %s", final_result)
60
 
61
  history = pack_history_conversations(hint_prompt, final_result)
62
  for loop_index in range(max_loop):
63
  if_loop_result = await llm_client.generate_answer(
64
+ text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history
 
65
  )
66
  if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
67
  if if_loop_result != "yes":
68
  break
69
 
70
  glean_result = await llm_client.generate_answer(
71
+ text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
 
72
  )
73
+ logger.info("Loop %s glean: %s", loop_index, glean_result)
74
 
75
+ history += pack_history_conversations(
76
+ KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
77
+ )
78
  final_result += glean_result
79
  if loop_index == max_loop - 1:
80
  break
 
82
  records = split_string_by_multi_markers(
83
  final_result,
84
  [
85
+ KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
86
+ KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
87
+ ],
88
  )
89
 
90
  nodes = defaultdict(list)
 
94
  record = re.search(r"\((.*)\)", record)
95
  if record is None:
96
  continue
97
+ record = record.group(1) # 提取括号内的内容
98
  record_attributes = split_string_by_multi_markers(
99
  record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
100
  )
101
 
102
+ entity = await handle_single_entity_extraction(
103
+ record_attributes, chunk_id
104
+ )
105
  if entity is not None:
106
  nodes[entity["entity_name"]].append(entity)
107
  continue
108
+ relation = await handle_single_relationship_extraction(
109
+ record_attributes, chunk_id
110
+ )
111
  if relation is not None:
112
  edges[(relation["src_id"], relation["tgt_id"])].append(relation)
113
  return dict(nodes), dict(edges)
 
117
  async for result in tqdm_async(
118
  asyncio.as_completed([_process_single_content(c) for c in chunks]),
119
  total=len(chunks),
120
+ desc="[2/4]Extracting entities and relationships from chunks",
121
  unit="chunk",
122
  ):
123
  try:
124
  if progress_bar is not None:
125
+ progress_bar(
126
+ len(results) / chunk_number,
127
+ desc="[3/4]Extracting entities and relationships from chunks",
128
+ )
129
  results.append(await result)
130
  if progress_bar is not None and len(results) == chunk_number:
131
+ progress_bar(
132
+ 1, desc="[3/4]Extracting entities and relationships from chunks"
133
+ )
134
+ except Exception as e: # pylint: disable=broad-except
135
+ logger.error(
136
+ "Error occurred while extracting entities and relationships from chunks: %s",
137
+ e,
138
+ )
139
 
140
  nodes = defaultdict(list)
141
  edges = defaultdict(list)
graphgen/operators/{merge_kg.py → kg/merge_kg.py} RENAMED
@@ -1,19 +1,21 @@
1
- from collections import Counter
2
  import asyncio
 
 
3
  from tqdm.asyncio import tqdm as tqdm_async
4
 
5
- from graphgen.utils.format import split_string_by_multi_markers
6
- from graphgen.utils import logger, detect_main_language
7
- from graphgen.models import TopkTokenModel, Tokenizer
8
  from graphgen.models.storage.base_storage import BaseGraphStorage
9
- from graphgen.templates import KG_SUMMARIZATION_PROMPT, KG_EXTRACTION_PROMPT
 
 
 
10
 
11
  async def _handle_kg_summary(
12
  entity_or_relation_name: str,
13
  description: str,
14
  llm_client: TopkTokenModel,
15
  tokenizer_instance: Tokenizer,
16
- max_summary_tokens: int = 200
17
  ) -> str:
18
  """
19
  处理实体或关系的描述信息
@@ -33,17 +35,19 @@ async def _handle_kg_summary(
33
  KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
34
 
35
  tokens = tokenizer_instance.encode_string(description)
36
- if len(tokens) < max_summary_tokens:
37
  return description
38
 
39
  use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens])
40
  prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format(
41
  entity_name=entity_or_relation_name,
42
- description_list=use_description.split('<SEP>'),
43
- **KG_SUMMARIZATION_PROMPT["FORMAT"]
44
  )
45
  new_description = await llm_client.generate_answer(prompt)
46
- logger.info("Entity or relation %s summary: %s", entity_or_relation_name, new_description)
 
 
47
  return new_description
48
 
49
 
@@ -52,7 +56,7 @@ async def merge_nodes(
52
  kg_instance: BaseGraphStorage,
53
  llm_client: TopkTokenModel,
54
  tokenizer_instance: Tokenizer,
55
- max_concurrent: int = 1000
56
  ):
57
  """
58
  Merge nodes
@@ -77,39 +81,34 @@ async def merge_nodes(
77
  if node is not None:
78
  entity_types.append(node["entity_type"])
79
  source_ids.extend(
80
- split_string_by_multi_markers(node["source_id"], ['<SEP>'])
81
  )
82
  descriptions.append(node["description"])
83
 
84
  # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type
85
  entity_type = sorted(
86
- Counter(
87
- [dp["entity_type"] for dp in node_data] + entity_types
88
- ).items(),
89
  key=lambda x: x[1],
90
  reverse=True,
91
  )[0][0]
92
 
93
- description = '<SEP>'.join(
94
  sorted(set([dp["description"] for dp in node_data] + descriptions))
95
  )
96
  description = await _handle_kg_summary(
97
  entity_name, description, llm_client, tokenizer_instance
98
  )
99
 
100
- source_id = '<SEP>'.join(
101
  set([dp["source_id"] for dp in node_data] + source_ids)
102
  )
103
 
104
  node_data = {
105
  "entity_type": entity_type,
106
  "description": description,
107
- "source_id": source_id
108
  }
109
- await kg_instance.upsert_node(
110
- entity_name,
111
- node_data=node_data
112
- )
113
  node_data["entity_name"] = entity_name
114
  return node_data
115
 
@@ -125,7 +124,7 @@ async def merge_nodes(
125
  ):
126
  try:
127
  entities_data.append(await result)
128
- except Exception as e: # pylint: disable=broad-except
129
  logger.error("Error occurred while inserting entities into storage: %s", e)
130
 
131
 
@@ -134,7 +133,7 @@ async def merge_edges(
134
  kg_instance: BaseGraphStorage,
135
  llm_client: TopkTokenModel,
136
  tokenizer_instance: Tokenizer,
137
- max_concurrent: int = 1000
138
  ):
139
  """
140
  Merge edges
@@ -157,14 +156,14 @@ async def merge_edges(
157
  edge = await kg_instance.get_edge(src_id, tgt_id)
158
  if edge is not None:
159
  source_ids.extend(
160
- split_string_by_multi_markers(edge["source_id"], ['<SEP>'])
161
  )
162
  descriptions.append(edge["description"])
163
 
164
- description = '<SEP>'.join(
165
  sorted(set([dp["description"] for dp in edge_data] + descriptions))
166
  )
167
- source_id = '<SEP>'.join(
168
  set([dp["source_id"] for dp in edge_data] + source_ids)
169
  )
170
 
@@ -175,8 +174,8 @@ async def merge_edges(
175
  node_data={
176
  "source_id": source_id,
177
  "description": description,
178
- "entity_type": "UNKNOWN"
179
- }
180
  )
181
 
182
  description = await _handle_kg_summary(
@@ -186,24 +185,20 @@ async def merge_edges(
186
  await kg_instance.upsert_edge(
187
  src_id,
188
  tgt_id,
189
- edge_data={
190
- "source_id": source_id,
191
- "description": description
192
- }
193
  )
194
 
195
- edge_data = {
196
- "src_id": src_id,
197
- "tgt_id": tgt_id,
198
- "description": description
199
- }
200
  return edge_data
201
 
202
  logger.info("Inserting relationships into storage...")
203
  relationships_data = []
204
  for result in tqdm_async(
205
  asyncio.as_completed(
206
- [process_single_edge(src_id, tgt_id, v) for (src_id, tgt_id), v in edges_data.items()]
 
 
 
207
  ),
208
  total=len(edges_data),
209
  desc="Inserting relationships into storage",
@@ -211,5 +206,7 @@ async def merge_edges(
211
  ):
212
  try:
213
  relationships_data.append(await result)
214
- except Exception as e: # pylint: disable=broad-except
215
- logger.error("Error occurred while inserting relationships into storage: %s", e)
 
 
 
 
1
  import asyncio
2
+ from collections import Counter
3
+
4
  from tqdm.asyncio import tqdm as tqdm_async
5
 
6
+ from graphgen.models import Tokenizer, TopkTokenModel
 
 
7
  from graphgen.models.storage.base_storage import BaseGraphStorage
8
+ from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT
9
+ from graphgen.utils import detect_main_language, logger
10
+ from graphgen.utils.format import split_string_by_multi_markers
11
+
12
 
13
  async def _handle_kg_summary(
14
  entity_or_relation_name: str,
15
  description: str,
16
  llm_client: TopkTokenModel,
17
  tokenizer_instance: Tokenizer,
18
+ max_summary_tokens: int = 200,
19
  ) -> str:
20
  """
21
  处理实体或关系的描述信息
 
35
  KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
36
 
37
  tokens = tokenizer_instance.encode_string(description)
38
+ if len(tokens) < max_summary_tokens:
39
  return description
40
 
41
  use_description = tokenizer_instance.decode_tokens(tokens[:max_summary_tokens])
42
  prompt = KG_SUMMARIZATION_PROMPT[language]["TEMPLATE"].format(
43
  entity_name=entity_or_relation_name,
44
+ description_list=use_description.split("<SEP>"),
45
+ **KG_SUMMARIZATION_PROMPT["FORMAT"],
46
  )
47
  new_description = await llm_client.generate_answer(prompt)
48
+ logger.info(
49
+ "Entity or relation %s summary: %s", entity_or_relation_name, new_description
50
+ )
51
  return new_description
52
 
53
 
 
56
  kg_instance: BaseGraphStorage,
57
  llm_client: TopkTokenModel,
58
  tokenizer_instance: Tokenizer,
59
+ max_concurrent: int = 1000,
60
  ):
61
  """
62
  Merge nodes
 
81
  if node is not None:
82
  entity_types.append(node["entity_type"])
83
  source_ids.extend(
84
+ split_string_by_multi_markers(node["source_id"], ["<SEP>"])
85
  )
86
  descriptions.append(node["description"])
87
 
88
  # 统计当前节点数据和已有节点数据的entity_type出现次数,取出现次数最多的entity_type
89
  entity_type = sorted(
90
+ Counter([dp["entity_type"] for dp in node_data] + entity_types).items(),
 
 
91
  key=lambda x: x[1],
92
  reverse=True,
93
  )[0][0]
94
 
95
+ description = "<SEP>".join(
96
  sorted(set([dp["description"] for dp in node_data] + descriptions))
97
  )
98
  description = await _handle_kg_summary(
99
  entity_name, description, llm_client, tokenizer_instance
100
  )
101
 
102
+ source_id = "<SEP>".join(
103
  set([dp["source_id"] for dp in node_data] + source_ids)
104
  )
105
 
106
  node_data = {
107
  "entity_type": entity_type,
108
  "description": description,
109
+ "source_id": source_id,
110
  }
111
+ await kg_instance.upsert_node(entity_name, node_data=node_data)
 
 
 
112
  node_data["entity_name"] = entity_name
113
  return node_data
114
 
 
124
  ):
125
  try:
126
  entities_data.append(await result)
127
+ except Exception as e: # pylint: disable=broad-except
128
  logger.error("Error occurred while inserting entities into storage: %s", e)
129
 
130
 
 
133
  kg_instance: BaseGraphStorage,
134
  llm_client: TopkTokenModel,
135
  tokenizer_instance: Tokenizer,
136
+ max_concurrent: int = 1000,
137
  ):
138
  """
139
  Merge edges
 
156
  edge = await kg_instance.get_edge(src_id, tgt_id)
157
  if edge is not None:
158
  source_ids.extend(
159
+ split_string_by_multi_markers(edge["source_id"], ["<SEP>"])
160
  )
161
  descriptions.append(edge["description"])
162
 
163
+ description = "<SEP>".join(
164
  sorted(set([dp["description"] for dp in edge_data] + descriptions))
165
  )
166
+ source_id = "<SEP>".join(
167
  set([dp["source_id"] for dp in edge_data] + source_ids)
168
  )
169
 
 
174
  node_data={
175
  "source_id": source_id,
176
  "description": description,
177
+ "entity_type": "UNKNOWN",
178
+ },
179
  )
180
 
181
  description = await _handle_kg_summary(
 
185
  await kg_instance.upsert_edge(
186
  src_id,
187
  tgt_id,
188
+ edge_data={"source_id": source_id, "description": description},
 
 
 
189
  )
190
 
191
+ edge_data = {"src_id": src_id, "tgt_id": tgt_id, "description": description}
 
 
 
 
192
  return edge_data
193
 
194
  logger.info("Inserting relationships into storage...")
195
  relationships_data = []
196
  for result in tqdm_async(
197
  asyncio.as_completed(
198
+ [
199
+ process_single_edge(src_id, tgt_id, v)
200
+ for (src_id, tgt_id), v in edges_data.items()
201
+ ]
202
  ),
203
  total=len(edges_data),
204
  desc="Inserting relationships into storage",
 
206
  ):
207
  try:
208
  relationships_data.append(await result)
209
+ except Exception as e: # pylint: disable=broad-except
210
+ logger.error(
211
+ "Error occurred while inserting relationships into storage: %s", e
212
+ )
graphgen/operators/{split_graph.py → kg/split_kg.py} RENAMED
@@ -1,14 +1,16 @@
1
  import random
2
  from collections import defaultdict
 
3
  from tqdm.asyncio import tqdm as tqdm_async
4
- from graphgen.utils import logger
5
 
6
  from graphgen.models import NetworkXStorage, TraverseStrategy
 
 
7
 
8
  async def _get_node_info(
9
  node_id: str,
10
  graph_storage: NetworkXStorage,
11
- )-> dict:
12
  """
13
  Get node info
14
 
@@ -17,10 +19,7 @@ async def _get_node_info(
17
  :return: node info
18
  """
19
  node_data = await graph_storage.get_node(node_id)
20
- return {
21
- "node_id": node_id,
22
- **node_data
23
- }
24
 
25
 
26
  def _get_level_n_edges_by_max_width(
@@ -33,7 +32,7 @@ def _get_level_n_edges_by_max_width(
33
  bidirectional: bool,
34
  max_extra_edges: int,
35
  edge_sampling: str,
36
- loss_strategy: str = "only_edge"
37
  ) -> list:
38
  """
39
  Get level n edges for an edge.
@@ -71,10 +70,17 @@ def _get_level_n_edges_by_max_width(
71
 
72
  if len(candidate_edges) >= max_extra_edges:
73
  if loss_strategy == "both":
74
- er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges]
75
- candidate_edges = _sort_tuples(er_tuples, edge_sampling)[:max_extra_edges]
 
 
 
 
 
76
  elif loss_strategy == "only_edge":
77
- candidate_edges = _sort_edges(candidate_edges, edge_sampling)[:max_extra_edges]
 
 
78
  else:
79
  raise ValueError(f"Invalid loss strategy: {loss_strategy}")
80
 
@@ -101,16 +107,16 @@ def _get_level_n_edges_by_max_width(
101
 
102
 
103
  def _get_level_n_edges_by_max_tokens(
104
- edge_adj_list: dict,
105
- node_dict: dict,
106
- edges: list,
107
- nodes: list,
108
- src_edge: tuple,
109
- max_depth: int,
110
- bidirectional: bool,
111
- max_tokens: int,
112
- edge_sampling: str,
113
- loss_strategy: str = "only_edge"
114
  ) -> list:
115
  """
116
  Get level n edges for an edge.
@@ -129,8 +135,11 @@ def _get_level_n_edges_by_max_tokens(
129
  """
130
  src_id, tgt_id, src_edge_data = src_edge
131
 
132
- max_tokens -= (src_edge_data["length"] + nodes[node_dict[src_id]][1]["length"]
133
- + nodes[node_dict[tgt_id]][1]["length"])
 
 
 
134
 
135
  level_n_edges = []
136
 
@@ -151,7 +160,10 @@ def _get_level_n_edges_by_max_tokens(
151
  break
152
 
153
  if loss_strategy == "both":
154
- er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in candidate_edges]
 
 
 
155
  candidate_edges = _sort_tuples(er_tuples, edge_sampling)
156
  elif loss_strategy == "only_edge":
157
  candidate_edges = _sort_edges(candidate_edges, edge_sampling)
@@ -196,15 +208,22 @@ def _sort_tuples(er_tuples: list, edge_sampling: str) -> list:
196
  if edge_sampling == "random":
197
  er_tuples = random.sample(er_tuples, len(er_tuples))
198
  elif edge_sampling == "min_loss":
199
- er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"])
 
 
 
200
  elif edge_sampling == "max_loss":
201
- er_tuples = sorted(er_tuples, key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
202
- reverse=True)
 
 
 
203
  else:
204
  raise ValueError(f"Invalid edge sampling: {edge_sampling}")
205
  edges = [edge for _, edge in er_tuples]
206
  return edges
207
 
 
208
  def _sort_edges(edges: list, edge_sampling: str) -> list:
209
  """
210
  Sort edges with edge sampling strategy
@@ -223,11 +242,12 @@ def _sort_edges(edges: list, edge_sampling: str) -> list:
223
  raise ValueError(f"Invalid edge sampling: {edge_sampling}")
224
  return edges
225
 
226
- async def get_batches_with_strategy( # pylint: disable=too-many-branches
 
227
  nodes: list,
228
  edges: list,
229
  graph_storage: NetworkXStorage,
230
- traverse_strategy: TraverseStrategy
231
  ):
232
  expand_method = traverse_strategy.expand_method
233
  if expand_method == "max_width":
@@ -256,7 +276,10 @@ async def get_batches_with_strategy( # pylint: disable=too-many-branches
256
  node_dict[node_name] = i
257
 
258
  if traverse_strategy.loss_strategy == "both":
259
- er_tuples = [([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge) for edge in edges]
 
 
 
260
  edges = _sort_tuples(er_tuples, edge_sampling)
261
  elif traverse_strategy.loss_strategy == "only_edge":
262
  edges = _sort_edges(edges, edge_sampling)
@@ -279,21 +302,36 @@ async def get_batches_with_strategy( # pylint: disable=too-many-branches
279
  src_id = edge[0]
280
  tgt_id = edge[1]
281
 
282
- _process_nodes.extend([await get_cached_node_info(src_id),
283
- await get_cached_node_info(tgt_id)])
 
284
  _process_edges.append(edge)
285
 
286
  if expand_method == "max_width":
287
  level_n_edges = _get_level_n_edges_by_max_width(
288
- edge_adj_list, node_dict, edges, nodes, edge, max_depth,
289
- traverse_strategy.bidirectional, traverse_strategy.max_extra_edges,
290
- edge_sampling, traverse_strategy.loss_strategy
 
 
 
 
 
 
 
291
  )
292
  else:
293
  level_n_edges = _get_level_n_edges_by_max_tokens(
294
- edge_adj_list, node_dict, edges, nodes, edge, max_depth,
295
- traverse_strategy.bidirectional, traverse_strategy.max_tokens,
296
- edge_sampling, traverse_strategy.loss_strategy
 
 
 
 
 
 
 
297
  )
298
 
299
  for _edge in level_n_edges:
@@ -302,8 +340,12 @@ async def get_batches_with_strategy( # pylint: disable=too-many-branches
302
  _process_edges.append(_edge)
303
 
304
  # 去重
305
- _process_nodes = list({node['node_id']: node for node in _process_nodes}.values())
306
- _process_edges = list({(edge[0], edge[1]): edge for edge in _process_edges}.values())
 
 
 
 
307
 
308
  processing_batches.append((_process_nodes, _process_edges))
309
 
@@ -312,15 +354,21 @@ async def get_batches_with_strategy( # pylint: disable=too-many-branches
312
  # isolate nodes
313
  isolated_node_strategy = traverse_strategy.isolated_node_strategy
314
  if isolated_node_strategy == "add":
315
- processing_batches = await _add_isolated_nodes(nodes, processing_batches, graph_storage)
316
- logger.info("Processing batches after adding isolated nodes: %d", len(processing_batches))
 
 
 
 
 
317
 
318
  return processing_batches
319
 
 
320
  async def _add_isolated_nodes(
321
- nodes: list,
322
- processing_batches: list,
323
- graph_storage: NetworkXStorage,
324
  ) -> list:
325
  visited_nodes = set()
326
  for _process_nodes, _process_edges in processing_batches:
 
1
  import random
2
  from collections import defaultdict
3
+
4
  from tqdm.asyncio import tqdm as tqdm_async
 
5
 
6
  from graphgen.models import NetworkXStorage, TraverseStrategy
7
+ from graphgen.utils import logger
8
+
9
 
10
  async def _get_node_info(
11
  node_id: str,
12
  graph_storage: NetworkXStorage,
13
+ ) -> dict:
14
  """
15
  Get node info
16
 
 
19
  :return: node info
20
  """
21
  node_data = await graph_storage.get_node(node_id)
22
+ return {"node_id": node_id, **node_data}
 
 
 
23
 
24
 
25
  def _get_level_n_edges_by_max_width(
 
32
  bidirectional: bool,
33
  max_extra_edges: int,
34
  edge_sampling: str,
35
+ loss_strategy: str = "only_edge",
36
  ) -> list:
37
  """
38
  Get level n edges for an edge.
 
70
 
71
  if len(candidate_edges) >= max_extra_edges:
72
  if loss_strategy == "both":
73
+ er_tuples = [
74
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
75
+ for edge in candidate_edges
76
+ ]
77
+ candidate_edges = _sort_tuples(er_tuples, edge_sampling)[
78
+ :max_extra_edges
79
+ ]
80
  elif loss_strategy == "only_edge":
81
+ candidate_edges = _sort_edges(candidate_edges, edge_sampling)[
82
+ :max_extra_edges
83
+ ]
84
  else:
85
  raise ValueError(f"Invalid loss strategy: {loss_strategy}")
86
 
 
107
 
108
 
109
  def _get_level_n_edges_by_max_tokens(
110
+ edge_adj_list: dict,
111
+ node_dict: dict,
112
+ edges: list,
113
+ nodes: list,
114
+ src_edge: tuple,
115
+ max_depth: int,
116
+ bidirectional: bool,
117
+ max_tokens: int,
118
+ edge_sampling: str,
119
+ loss_strategy: str = "only_edge",
120
  ) -> list:
121
  """
122
  Get level n edges for an edge.
 
135
  """
136
  src_id, tgt_id, src_edge_data = src_edge
137
 
138
+ max_tokens -= (
139
+ src_edge_data["length"]
140
+ + nodes[node_dict[src_id]][1]["length"]
141
+ + nodes[node_dict[tgt_id]][1]["length"]
142
+ )
143
 
144
  level_n_edges = []
145
 
 
160
  break
161
 
162
  if loss_strategy == "both":
163
+ er_tuples = [
164
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
165
+ for edge in candidate_edges
166
+ ]
167
  candidate_edges = _sort_tuples(er_tuples, edge_sampling)
168
  elif loss_strategy == "only_edge":
169
  candidate_edges = _sort_edges(candidate_edges, edge_sampling)
 
208
  if edge_sampling == "random":
209
  er_tuples = random.sample(er_tuples, len(er_tuples))
210
  elif edge_sampling == "min_loss":
211
+ er_tuples = sorted(
212
+ er_tuples,
213
+ key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
214
+ )
215
  elif edge_sampling == "max_loss":
216
+ er_tuples = sorted(
217
+ er_tuples,
218
+ key=lambda x: sum(node[1]["loss"] for node in x[0]) + x[1][2]["loss"],
219
+ reverse=True,
220
+ )
221
  else:
222
  raise ValueError(f"Invalid edge sampling: {edge_sampling}")
223
  edges = [edge for _, edge in er_tuples]
224
  return edges
225
 
226
+
227
  def _sort_edges(edges: list, edge_sampling: str) -> list:
228
  """
229
  Sort edges with edge sampling strategy
 
242
  raise ValueError(f"Invalid edge sampling: {edge_sampling}")
243
  return edges
244
 
245
+
246
+ async def get_batches_with_strategy( # pylint: disable=too-many-branches
247
  nodes: list,
248
  edges: list,
249
  graph_storage: NetworkXStorage,
250
+ traverse_strategy: TraverseStrategy,
251
  ):
252
  expand_method = traverse_strategy.expand_method
253
  if expand_method == "max_width":
 
276
  node_dict[node_name] = i
277
 
278
  if traverse_strategy.loss_strategy == "both":
279
+ er_tuples = [
280
+ ([nodes[node_dict[edge[0]]], nodes[node_dict[edge[1]]]], edge)
281
+ for edge in edges
282
+ ]
283
  edges = _sort_tuples(er_tuples, edge_sampling)
284
  elif traverse_strategy.loss_strategy == "only_edge":
285
  edges = _sort_edges(edges, edge_sampling)
 
302
  src_id = edge[0]
303
  tgt_id = edge[1]
304
 
305
+ _process_nodes.extend(
306
+ [await get_cached_node_info(src_id), await get_cached_node_info(tgt_id)]
307
+ )
308
  _process_edges.append(edge)
309
 
310
  if expand_method == "max_width":
311
  level_n_edges = _get_level_n_edges_by_max_width(
312
+ edge_adj_list,
313
+ node_dict,
314
+ edges,
315
+ nodes,
316
+ edge,
317
+ max_depth,
318
+ traverse_strategy.bidirectional,
319
+ traverse_strategy.max_extra_edges,
320
+ edge_sampling,
321
+ traverse_strategy.loss_strategy,
322
  )
323
  else:
324
  level_n_edges = _get_level_n_edges_by_max_tokens(
325
+ edge_adj_list,
326
+ node_dict,
327
+ edges,
328
+ nodes,
329
+ edge,
330
+ max_depth,
331
+ traverse_strategy.bidirectional,
332
+ traverse_strategy.max_tokens,
333
+ edge_sampling,
334
+ traverse_strategy.loss_strategy,
335
  )
336
 
337
  for _edge in level_n_edges:
 
340
  _process_edges.append(_edge)
341
 
342
  # 去重
343
+ _process_nodes = list(
344
+ {node["node_id"]: node for node in _process_nodes}.values()
345
+ )
346
+ _process_edges = list(
347
+ {(edge[0], edge[1]): edge for edge in _process_edges}.values()
348
+ )
349
 
350
  processing_batches.append((_process_nodes, _process_edges))
351
 
 
354
  # isolate nodes
355
  isolated_node_strategy = traverse_strategy.isolated_node_strategy
356
  if isolated_node_strategy == "add":
357
+ processing_batches = await _add_isolated_nodes(
358
+ nodes, processing_batches, graph_storage
359
+ )
360
+ logger.info(
361
+ "Processing batches after adding isolated nodes: %d",
362
+ len(processing_batches),
363
+ )
364
 
365
  return processing_batches
366
 
367
+
368
  async def _add_isolated_nodes(
369
+ nodes: list,
370
+ processing_batches: list,
371
+ graph_storage: NetworkXStorage,
372
  ) -> list:
373
  visited_nodes = set()
374
  for _process_nodes, _process_edges in processing_batches:
graphgen/operators/preprocess/__init__.py ADDED
File without changes
graphgen/operators/{resolute_coreference.py → preprocess/resolute_coreference.py} RENAMED
@@ -1,12 +1,13 @@
1
  from typing import List
2
- from graphgen.models import Chunk
3
- from graphgen.models import OpenAIModel
4
- from graphgen.templates import COREFERENCE_RESOLUTION_TEMPLATE
5
  from graphgen.utils import detect_main_language
6
 
 
7
  async def resolute_coreference(
8
- llm_client: OpenAIModel,
9
- chunks: List[Chunk]) -> List[Chunk]:
10
  """
11
  Resolute conference
12
 
@@ -23,9 +24,8 @@ async def resolute_coreference(
23
  for _, chunk in enumerate(chunks[1:]):
24
  language = detect_main_language(chunk.content)
25
  result = await llm_client.generate_answer(
26
- COREFERENCE_RESOLUTION_TEMPLATE[language].format(
27
- reference = results[0].content,
28
- input_sentence = chunk.content
29
  )
30
  )
31
  results.append(Chunk(id=chunk.id, content=result))
 
1
  from typing import List
2
+
3
+ from graphgen.models import Chunk, OpenAIModel
4
+ from graphgen.templates import COREFERENCE_RESOLUTION_PROMPT
5
  from graphgen.utils import detect_main_language
6
 
7
+
8
  async def resolute_coreference(
9
+ llm_client: OpenAIModel, chunks: List[Chunk]
10
+ ) -> List[Chunk]:
11
  """
12
  Resolute conference
13
 
 
24
  for _, chunk in enumerate(chunks[1:]):
25
  language = detect_main_language(chunk.content)
26
  result = await llm_client.generate_answer(
27
+ COREFERENCE_RESOLUTION_PROMPT[language].format(
28
+ reference=results[0].content, input_sentence=chunk.content
 
29
  )
30
  )
31
  results.append(Chunk(id=chunk.id, content=result))
graphgen/operators/search/__init__.py ADDED
File without changes
graphgen/operators/search/db/__init__.py ADDED
File without changes
graphgen/operators/search/db/search_uniprot.py ADDED
File without changes
graphgen/operators/search/kg/__init__.py ADDED
File without changes
graphgen/operators/search/kg/search_wikipedia.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
2
+
3
+ from graphgen.models import WikiSearch
4
+ from graphgen.utils import logger
5
+
6
+
7
+ async def _process_single_entity(
8
+ entity_name: str,
9
+ wiki_search_client: WikiSearch,
10
+ ) -> str | None:
11
+ """
12
+ Process single entity by searching Wikipedia
13
+ :param entity_name
14
+ :param wiki_search_client
15
+ :return: summary of the entity or None if not found
16
+ """
17
+ search_results = await wiki_search_client.search(entity_name)
18
+ if not search_results:
19
+ return None
20
+
21
+ summary = None
22
+ try:
23
+ summary = await wiki_search_client.summary(search_results[-1])
24
+ logger.info(
25
+ "Entity %s search result: %s summary: %s",
26
+ entity_name,
27
+ str(search_results),
28
+ summary,
29
+ )
30
+ except Exception as e: # pylint: disable=broad-except
31
+ logger.error("Error processing entity %s: %s", entity_name, str(e))
32
+
33
+ return summary
34
+
35
+
36
+ async def search_wikipedia(
37
+ wiki_search_client: WikiSearch,
38
+ entities: set[str],
39
+ ) -> dict:
40
+ """
41
+ Search wikipedia for entities
42
+
43
+ :param wiki_search_client: wiki search client
44
+ :param entities: list of entities to search
45
+ :return: nodes with search results
46
+ """
47
+ wiki_data = {}
48
+
49
+ async for entity in tqdm_async(
50
+ entities, desc="Searching Wikipedia", total=len(entities)
51
+ ):
52
+ try:
53
+ summary = await _process_single_entity(entity, wiki_search_client)
54
+ if summary:
55
+ wiki_data[entity] = summary
56
+ except Exception as e: # pylint: disable=broad-except
57
+ logger.error("Error processing entity %s: %s", entity, str(e))
58
+ return wiki_data
graphgen/operators/search/search_all.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ To use Google Web Search API,
3
+ follow the instructions [here](https://developers.google.com/custom-search/v1/overview)
4
+ to get your Google search api key.
5
+
6
+ To use Bing Web Search API,
7
+ follow the instructions [here](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api)
8
+ and obtain your Bing subscription key.
9
+ """
10
+
11
+ import os
12
+
13
+ from graphgen.utils import logger
14
+
15
+
16
+ async def search_all(
17
+ search_types: dict, search_entities: set[str]
18
+ ) -> dict[str, dict[str, str]]:
19
+ """
20
+ :param search_types
21
+ :param search_entities: list of entities to search
22
+ :return: nodes with search results
23
+ """
24
+
25
+ results = {}
26
+
27
+ for search_type in search_types:
28
+ if search_type == "wikipedia":
29
+ from graphgen.models import WikiSearch
30
+ from graphgen.operators.search.kg.search_wikipedia import search_wikipedia
31
+
32
+ wiki_search_client = WikiSearch()
33
+
34
+ wiki_results = await search_wikipedia(wiki_search_client, search_entities)
35
+ for entity_name, description in wiki_results.items():
36
+ if description:
37
+ results[entity_name] = {"wikipedia": description}
38
+ elif search_type == "google":
39
+ from graphgen.models import GoogleSearch
40
+ from graphgen.operators.search.web.search_google import search_google
41
+
42
+ google_search_client = GoogleSearch(
43
+ subscription_key=os.environ["GOOGLE_SEARCH_API_KEY"],
44
+ cx=os.environ["GOOGLE_SEARCH_CX"],
45
+ )
46
+
47
+ google_results = await search_google(google_search_client, search_entities)
48
+ for entity_name, description in google_results.items():
49
+ if description:
50
+ results[entity_name] = results.get(entity_name, {})
51
+ results[entity_name]["google"] = description
52
+ elif search_type == "bing":
53
+ from graphgen.models import BingSearch
54
+ from graphgen.operators.search.web.search_bing import search_bing
55
+
56
+ bing_search_client = BingSearch(
57
+ subscription_key=os.environ["BING_SEARCH_API_KEY"]
58
+ )
59
+
60
+ bing_results = await search_bing(bing_search_client, search_entities)
61
+ for entity_name, description in bing_results.items():
62
+ if description:
63
+ results[entity_name] = results.get(entity_name, {})
64
+ results[entity_name]["bing"] = description
65
+ elif search_type == "uniprot":
66
+ # from graphgen.models import UniProtSearch
67
+ # from graphgen.operators.search.db.search_uniprot import search_uniprot
68
+ #
69
+ # uniprot_search_client = UniProtSearch()
70
+ #
71
+ # uniprot_results = await search_uniprot(
72
+ # uniprot_search_client, search_entities
73
+ # )
74
+ raise NotImplementedError(
75
+ "Processing of UniProt search results is not implemented yet."
76
+ )
77
+
78
+ else:
79
+ logger.error("Search type %s is not supported yet.", search_type)
80
+ continue
81
+
82
+ return results
graphgen/operators/search/web/__init__.py ADDED
File without changes
graphgen/operators/search/web/search_bing.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import trafilatura
2
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
+
4
+ from graphgen.models import BingSearch
5
+ from graphgen.utils import logger
6
+
7
+
8
+ async def _process_single_entity(
9
+ entity_name: str, bing_search_client: BingSearch
10
+ ) -> str | None:
11
+ """
12
+ Process single entity by searching Bing.
13
+ :param entity_name: The name of the entity to search.
14
+ :param bing_search_client: The Bing search client.
15
+ :return: Summary of the entity or None if not found.
16
+ """
17
+ search_results = bing_search_client.search(entity_name)
18
+ if not search_results:
19
+ return None
20
+
21
+ # Get more details from the first search result
22
+ first_result = search_results[0]
23
+ content = trafilatura.fetch_url(first_result["url"])
24
+ summary = trafilatura.extract(content, include_comments=False, include_links=False)
25
+ summary = summary.strip()
26
+ logger.info(
27
+ "Entity %s search result: %s",
28
+ entity_name,
29
+ summary,
30
+ )
31
+ return summary
32
+
33
+
34
+ async def search_bing(
35
+ bing_search_client: BingSearch,
36
+ entities: set[str],
37
+ ) -> dict[str, str]:
38
+ """
39
+ Search with Bing and return the contexts.
40
+ :return:
41
+ """
42
+ bing_data = {}
43
+
44
+ async for entity in tqdm_async(
45
+ entities, desc="Searching Bing", total=len(entities)
46
+ ):
47
+ try:
48
+ summary = await _process_single_entity(entity, bing_search_client)
49
+ if summary:
50
+ bing_data[entity] = summary
51
+ except Exception as e: # pylint: disable=broad-except
52
+ logger.error("Error processing entity %s: %s", entity, str(e))
53
+ return bing_data
graphgen/operators/search/web/search_google.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import trafilatura
2
+ from tqdm.asyncio import tqdm_asyncio as tqdm_async
3
+
4
+ from graphgen.models import GoogleSearch
5
+ from graphgen.utils import logger
6
+
7
+
8
+ async def _process_single_entity(
9
+ entity_name: str, google_search_client: GoogleSearch
10
+ ) -> str | None:
11
+ search_results = google_search_client.search(entity_name)
12
+ if not search_results:
13
+ return None
14
+
15
+ # Get more details from the first search result
16
+ first_result = search_results[0]
17
+ content = trafilatura.fetch_url(first_result["link"])
18
+ summary = trafilatura.extract(content, include_comments=False, include_links=False)
19
+ summary = summary.strip()
20
+ logger.info(
21
+ "Entity %s search result: %s",
22
+ entity_name,
23
+ summary,
24
+ )
25
+ return summary
26
+
27
+
28
+ async def search_google(
29
+ google_search_client: GoogleSearch,
30
+ entities: set[str],
31
+ ) -> dict:
32
+ """
33
+ Search with Google and return the contexts.
34
+ :param google_search_client: Google search client
35
+ :param entities: list of entities to search
36
+ :return:
37
+ """
38
+ google_data = {}
39
+
40
+ async for entity in tqdm_async(
41
+ entities, desc="Searching Google", total=len(entities)
42
+ ):
43
+ try:
44
+ summary = await _process_single_entity(entity, google_search_client)
45
+ if summary:
46
+ google_data[entity] = summary
47
+ except Exception as e: # pylint: disable=broad-except
48
+ logger.error("Error processing entity %s: %s", entity, str(e))
49
+ return google_data
graphgen/operators/search_wikipedia.py DELETED
@@ -1,71 +0,0 @@
1
- import asyncio
2
- from graphgen.models import WikiSearch, OpenAIModel
3
- from graphgen.models.storage.base_storage import BaseGraphStorage
4
- from graphgen.templates import SEARCH_JUDGEMENT_PROMPT
5
- from graphgen.utils import logger
6
-
7
-
8
- async def _process_single_entity(entity_name: str,
9
- description: str,
10
- llm_client: OpenAIModel,
11
- wiki_search_client: WikiSearch) -> tuple[str, None] | tuple[str, str]:
12
- """
13
- Process single entity
14
-
15
- """
16
- search_results = await wiki_search_client.search(entity_name)
17
- if not search_results:
18
- return entity_name, None
19
- examples = "\n".join(SEARCH_JUDGEMENT_PROMPT["EXAMPLES"])
20
- search_results.append("None of the above")
21
-
22
- search_results_str = "\n".join([f"{i + 1}. {sr}" for i, sr in enumerate(search_results)])
23
- prompt = SEARCH_JUDGEMENT_PROMPT["TEMPLATE"].format(
24
- examples=examples,
25
- entity_name=entity_name,
26
- description=description,
27
- search_results=search_results_str,
28
- )
29
- response = await llm_client.generate_answer(prompt)
30
-
31
- try:
32
- response = response.strip()
33
- response = int(response)
34
- if response < 1 or response >= len(search_results):
35
- response = None
36
- else:
37
- response = await wiki_search_client.summary(search_results[response - 1])
38
- except ValueError:
39
- response = None
40
-
41
- logger.info("Entity %s search result: %s response: %s", entity_name, str(search_results), response)
42
-
43
- return entity_name, response
44
-
45
- async def search_wikipedia(llm_client: OpenAIModel,
46
- wiki_search_client: WikiSearch,
47
- knowledge_graph_instance: BaseGraphStorage,) -> dict:
48
- """
49
- Search wikipedia for entities
50
-
51
- :param llm_client: LLM model
52
- :param wiki_search_client: wiki search client
53
- :param knowledge_graph_instance: knowledge graph instance
54
- :return: nodes with search results
55
- """
56
-
57
-
58
- nodes = await knowledge_graph_instance.get_all_nodes()
59
- nodes = list(nodes)
60
- wiki_data = {}
61
-
62
- tasks = [
63
- _process_single_entity(node[0].strip('"'), node[1]["description"], llm_client, wiki_search_client)
64
- for node in nodes
65
- ]
66
-
67
- for task in asyncio.as_completed(tasks):
68
- result = await task
69
- wiki_data[result[0]] = result[1]
70
-
71
- return wiki_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
graphgen/operators/traverse_graph.py CHANGED
@@ -1,49 +1,67 @@
1
  import asyncio
2
- import gradio as gr
3
 
 
4
  from tqdm.asyncio import tqdm as tqdm_async
5
 
6
- from graphgen.models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer, JsonKVStorage
7
- from graphgen.templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT, MULTI_HOP_GENERATION_PROMPT
8
- from graphgen.utils import detect_main_language, compute_content_hash, logger
9
- from graphgen.operators.split_graph import get_batches_with_strategy
10
-
11
-
12
- async def _pre_tokenize(graph_storage: NetworkXStorage,
13
- tokenizer: Tokenizer,
14
- edges: list,
15
- nodes: list) -> tuple:
 
 
 
 
 
 
 
 
 
16
 
17
  sem = asyncio.Semaphore(1000)
 
18
  async def handle_edge(edge: tuple) -> tuple:
19
  async with sem:
20
- if 'length' not in edge[2]:
21
- edge[2]['length'] = len(
22
- await asyncio.get_event_loop().run_in_executor(None,
23
- tokenizer.encode_string,
24
- edge[2]['description']))
 
25
  return edge
26
 
27
  async def handle_node(node: dict) -> dict:
28
  async with sem:
29
- if 'length' not in node[1]:
30
- node[1]['length'] = len(
31
- await asyncio.get_event_loop().run_in_executor(None,
32
- tokenizer.encode_string,
33
- node[1]['description']))
 
34
  return node
35
 
36
  new_edges = []
37
  new_nodes = []
38
 
39
- for result in tqdm_async(asyncio.as_completed([handle_edge(edge) for edge in edges]),
40
- total=len(edges), desc="Pre-tokenizing edges"):
 
 
 
41
  new_edge = await result
42
  await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2])
43
  new_edges.append(new_edge)
44
 
45
- for result in tqdm_async(asyncio.as_completed([handle_node(node) for node in nodes]),
46
- total=len(nodes), desc="Pre-tokenizing nodes"):
 
 
 
47
  new_node = await result
48
  await graph_storage.update_node(new_node[0], new_node[1])
49
  new_nodes.append(new_node)
@@ -51,60 +69,75 @@ async def _pre_tokenize(graph_storage: NetworkXStorage,
51
  await graph_storage.index_done_callback()
52
  return new_edges, new_nodes
53
 
54
- async def _construct_rephrasing_prompt(_process_nodes: list,
55
- _process_edges: list,
56
- text_chunks_storage: JsonKVStorage,
57
- add_context: bool = False
58
- ) -> str:
 
 
59
  entities = [
60
- f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
 
61
  ]
62
  relations = [
63
  f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
64
  for _process_edge in _process_edges
65
  ]
66
 
67
- entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
68
- relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
69
- language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English"
 
 
 
 
 
 
 
 
70
 
71
  if add_context:
72
- original_ids = ([node['source_id'].split('<SEP>')[0] for node in _process_nodes] +
73
- [edge[2]['source_id'].split('<SEP>')[0] for edge in _process_edges])
 
74
 
75
  original_ids = list(set(original_ids))
76
  original_text = await text_chunks_storage.get_by_ids(original_ids)
77
- original_text = "\n".join([f"{index + 1}. {text['content']}" for index, text in enumerate(original_text)])
 
 
 
 
 
78
 
79
- prompt = ANSWER_REPHRASING_PROMPT[language]['CONTEXT_TEMPLATE'].format(
80
  language=language,
81
  original_text=original_text,
82
  entities=entities_str,
83
- relationships=relations_str
84
  )
85
  return prompt
86
 
87
- prompt = ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format(
88
- language=language,
89
- entities=entities_str,
90
- relationships=relations_str
91
  )
92
  return prompt
93
 
94
- def get_loss_tercile(losses: list) -> (float, float):
95
- losses = sorted(losses)
96
- q1_index = int(len(losses) * (1 / 3))
97
- q2_index = int(len(losses) * (2 / 3))
98
-
99
- return losses[q1_index], losses[q2_index]
100
 
101
  def get_average_loss(batch: tuple, loss_strategy: str) -> float:
102
- if loss_strategy == "only_edge":
103
- return sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1])
104
- if loss_strategy == "both":
105
- return sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0]) / \
106
- (len(batch[0]) + len(batch[1]))
107
- raise ValueError("Invalid loss strategy")
 
 
 
 
 
 
108
 
109
  def _post_process_synthetic_data(data):
110
  block = data.split("\n\n")
@@ -113,26 +146,18 @@ def _post_process_synthetic_data(data):
113
  if "Question:" in line and "Answer:" in line:
114
  question = line.split("Question:")[1].split("Answer:")[0].strip()
115
  answer = line.split("Answer:")[1].strip()
116
- qas.append({
117
- "question": question,
118
- "answer": answer
119
- })
120
  elif "问题:" in line and "答案:" in line:
121
  question = line.split("问题:")[1].split("答案:")[0].strip()
122
  answer = line.split("答案:")[1].strip()
123
- qas.append({
124
- "question": question,
125
- "answer": answer
126
- })
127
  elif "问题:" in line and "回答:" in line:
128
  question = line.split("问题:")[1].split("回答:")[0].strip()
129
  answer = line.split("回答:")[1].strip()
130
- qas.append({
131
- "question": question,
132
- "answer": answer
133
- })
134
  return qas
135
 
 
136
  async def traverse_graph_by_edge(
137
  llm_client: OpenAIModel,
138
  tokenizer: Tokenizer,
@@ -140,7 +165,7 @@ async def traverse_graph_by_edge(
140
  traverse_strategy: TraverseStrategy,
141
  text_chunks_storage: JsonKVStorage,
142
  progress_bar: gr.Progress = None,
143
- max_concurrent: int = 1000
144
  ) -> dict:
145
  """
146
  Traverse the graph
@@ -158,28 +183,24 @@ async def traverse_graph_by_edge(
158
  semaphore = asyncio.Semaphore(max_concurrent)
159
 
160
  async def _process_nodes_and_edges(
161
- _process_nodes: list,
162
- _process_edges: list,
163
  ) -> str:
164
  prompt = await _construct_rephrasing_prompt(
165
- _process_nodes,
166
- _process_edges,
167
- text_chunks_storage,
168
- add_context = False
169
  )
170
  context = await llm_client.generate_answer(prompt)
171
 
172
  # post-process the context
173
  if context.startswith("Rephrased Text:"):
174
- context = context[len("Rephrased Text:"):].strip()
175
  elif context.startswith("重述文本:"):
176
- context = context[len("重述文本:"):].strip()
177
 
178
  return context
179
 
180
  async def _process_single_batch(
181
- _process_batch: tuple,
182
- question_type: str = "single"
183
  ) -> dict:
184
  async with semaphore:
185
  context = await _process_nodes_and_edges(
@@ -188,21 +209,26 @@ async def traverse_graph_by_edge(
188
  )
189
 
190
  language = "Chinese" if detect_main_language(context) == "zh" else "English"
191
- pre_length = sum(node['length'] for node in _process_batch[0]) \
192
- + sum(edge[2]['length'] for edge in _process_batch[1])
 
193
 
194
  if question_type == "single":
195
  question = await llm_client.generate_answer(
196
- QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format(
197
  answer=context
198
  )
199
  )
200
  if question.startswith("Question:"):
201
- question = question[len("Question:"):].strip()
202
  elif question.startswith("问题:"):
203
- question = question[len("问题:"):].strip()
204
 
205
- logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
 
 
 
 
206
  logger.info("Pre-length: %s", pre_length)
207
  logger.info("Question: %s", question)
208
  logger.info("Answer: %s", context)
@@ -211,12 +237,14 @@ async def traverse_graph_by_edge(
211
  compute_content_hash(context): {
212
  "question": question,
213
  "answer": context,
214
- "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy)
 
 
215
  }
216
  }
217
 
218
  content = await llm_client.generate_answer(
219
- QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format(
220
  doc=context
221
  )
222
  )
@@ -224,19 +252,27 @@ async def traverse_graph_by_edge(
224
 
225
  if len(qas) == 0:
226
  print(content)
227
- logger.error("Error occurred while processing batch, question or answer is None")
 
 
228
  return {}
229
 
230
  final_results = {}
231
- logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
 
 
 
 
232
  logger.info("Pre-length: %s", pre_length)
233
  for qa in qas:
234
- logger.info("Question: %s", qa['question'])
235
- logger.info("Answer: %s", qa['answer'])
236
- final_results[compute_content_hash(qa['question'])] = {
237
- "question": qa['question'],
238
- "answer": qa['answer'],
239
- "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy)
 
 
240
  }
241
  return final_results
242
 
@@ -247,22 +283,25 @@ async def traverse_graph_by_edge(
247
  edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
248
 
249
  processing_batches = await get_batches_with_strategy(
250
- nodes,
251
- edges,
252
- graph_storage,
253
- traverse_strategy
254
  )
255
 
256
- for result in tqdm_async(asyncio.as_completed(
257
- [_process_single_batch(batch) for batch in processing_batches]
258
- ), total=len(processing_batches), desc="[4/4]Generating QAs"):
 
 
 
 
259
  try:
260
  if progress_bar is not None:
261
- progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
 
 
262
  results.update(await result)
263
  if progress_bar is not None and len(results) == len(processing_batches):
264
  progress_bar(1, desc="[4/4]Generating QAs")
265
- except Exception as e: # pylint: disable=broad-except
266
  logger.error("Error occurred while generating QA: %s", e)
267
 
268
  return results
@@ -275,7 +314,7 @@ async def traverse_graph_atomically(
275
  traverse_strategy: TraverseStrategy,
276
  text_chunks_storage: JsonKVStorage,
277
  progress_bar: gr.Progress = None,
278
- max_concurrent: int = 1000
279
  ) -> dict:
280
  """
281
  Traverse the graph atomicly
@@ -292,22 +331,21 @@ async def traverse_graph_atomically(
292
  assert traverse_strategy.qa_form == "atomic"
293
 
294
  semaphore = asyncio.Semaphore(max_concurrent)
295
- async def _generate_question(
296
- node_or_edge: tuple
297
- ):
298
  if len(node_or_edge) == 2:
299
- des = node_or_edge[0] + ": " + node_or_edge[1]['description']
300
- loss = node_or_edge[1]['loss']
301
  else:
302
- des = node_or_edge[2]['description']
303
- loss = node_or_edge[2]['loss']
304
 
305
  async with semaphore:
306
  try:
307
  language = "Chinese" if detect_main_language(des) == "zh" else "English"
308
 
309
  qa = await llm_client.generate_answer(
310
- QUESTION_GENERATION_PROMPT[language]['SINGLE_QA_TEMPLATE'].format(
311
  doc=des
312
  )
313
  )
@@ -321,8 +359,8 @@ async def traverse_graph_atomically(
321
  else:
322
  return {}
323
 
324
- question = question.strip("\"")
325
- answer = answer.strip("\"")
326
 
327
  logger.info("Question: %s", question)
328
  logger.info("Answer: %s", answer)
@@ -330,10 +368,10 @@ async def traverse_graph_atomically(
330
  compute_content_hash(question): {
331
  "question": question,
332
  "answer": answer,
333
- "loss": loss
334
  }
335
  }
336
- except Exception as e: # pylint: disable=broad-except
337
  logger.error("Error occurred while generating question: %s", e)
338
  return {}
339
 
@@ -345,24 +383,26 @@ async def traverse_graph_atomically(
345
 
346
  tasks = []
347
  for node in nodes:
348
- if "<SEP>" in node[1]['description']:
349
- description_list = node[1]['description'].split("<SEP>")
350
  for item in description_list:
351
- tasks.append((node[0], {"description": item, 'loss': node[1]['loss']}))
352
  else:
353
  tasks.append((node[0], node[1]))
354
  for edge in edges:
355
- if "<SEP>" in edge[2]['description']:
356
- description_list = edge[2]['description'].split("<SEP>")
357
  for item in description_list:
358
- tasks.append((edge[0], edge[1], {"description": item, 'loss': edge[2]['loss']}))
 
 
359
  else:
360
  tasks.append((edge[0], edge[1], edge[2]))
361
 
362
  for result in tqdm_async(
363
  asyncio.as_completed([_generate_question(task) for task in tasks]),
364
  total=len(tasks),
365
- desc="[4/4]Generating QAs"
366
  ):
367
  try:
368
  if progress_bar is not None:
@@ -370,10 +410,11 @@ async def traverse_graph_atomically(
370
  results.update(await result)
371
  if progress_bar is not None and len(results) == len(tasks):
372
  progress_bar(1, desc="[4/4]Generating QAs")
373
- except Exception as e: # pylint: disable=broad-except
374
  logger.error("Error occurred while generating QA: %s", e)
375
  return results
376
 
 
377
  async def traverse_graph_for_multi_hop(
378
  llm_client: OpenAIModel,
379
  tokenizer: Tokenizer,
@@ -381,7 +422,7 @@ async def traverse_graph_for_multi_hop(
381
  traverse_strategy: TraverseStrategy,
382
  text_chunks_storage: JsonKVStorage,
383
  progress_bar: gr.Progress = None,
384
- max_concurrent: int = 1000
385
  ) -> dict:
386
  """
387
  Traverse the graph for multi-hop
@@ -395,8 +436,6 @@ async def traverse_graph_for_multi_hop(
395
  :param max_concurrent
396
  :return: question and answer
397
  """
398
- assert traverse_strategy.qa_form == "multi_hop"
399
-
400
  semaphore = asyncio.Semaphore(max_concurrent)
401
 
402
  results = {}
@@ -406,24 +445,24 @@ async def traverse_graph_for_multi_hop(
406
  edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
407
 
408
  processing_batches = await get_batches_with_strategy(
409
- nodes,
410
- edges,
411
- graph_storage,
412
- traverse_strategy
413
  )
414
 
415
- async def _process_single_batch(
416
- _process_batch: tuple
417
- ) -> dict:
418
  async with semaphore:
419
  try:
420
- language = "Chinese" if detect_main_language(_process_batch[0][0]['description']) == "zh" else "English"
 
 
 
 
421
 
422
  _process_nodes = _process_batch[0]
423
  _process_edges = _process_batch[1]
424
 
425
  entities = [
426
- f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
 
427
  ]
428
 
429
  relations = [
@@ -431,12 +470,18 @@ async def traverse_graph_for_multi_hop(
431
  for _process_edge in _process_edges
432
  ]
433
 
434
- entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
435
- relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
 
 
 
 
 
 
 
436
 
437
  prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
438
- entities=entities_str,
439
- relationships=relations_str
440
  )
441
 
442
  context = await llm_client.generate_answer(prompt)
@@ -451,8 +496,8 @@ async def traverse_graph_for_multi_hop(
451
  else:
452
  return {}
453
 
454
- question = question.strip("\"")
455
- answer = answer.strip("\"")
456
 
457
  logger.info("Question: %s", question)
458
  logger.info("Answer: %s", answer)
@@ -461,25 +506,31 @@ async def traverse_graph_for_multi_hop(
461
  compute_content_hash(question): {
462
  "question": question,
463
  "answer": answer,
464
- "loss": get_average_loss(_process_batch, traverse_strategy.loss_strategy),
 
 
465
  }
466
  }
467
 
468
- except Exception as e: # pylint: disable=broad-except
469
  logger.error("Error occurred while processing batch: %s", e)
470
  return {}
471
 
472
  async for result in tqdm_async(
473
- asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]),
 
 
474
  total=len(processing_batches),
475
- desc="[4/4]Generating QAs"
476
  ):
477
  try:
478
  if progress_bar is not None:
479
- progress_bar(len(results) / len(processing_batches), desc="[4/4]Generating QAs")
 
 
480
  results.update(await result)
481
  if progress_bar is not None and len(results) == len(processing_batches):
482
  progress_bar(1, desc="[4/4]Generating QAs")
483
- except Exception as e: # pylint: disable=broad-except
484
  logger.error("Error occurred while generating QA: %s", e)
485
  return results
 
1
  import asyncio
 
2
 
3
+ import gradio as gr
4
  from tqdm.asyncio import tqdm as tqdm_async
5
 
6
+ from graphgen.models import (
7
+ JsonKVStorage,
8
+ NetworkXStorage,
9
+ OpenAIModel,
10
+ Tokenizer,
11
+ TraverseStrategy,
12
+ )
13
+ from graphgen.operators.kg.split_kg import get_batches_with_strategy
14
+ from graphgen.templates import (
15
+ ANSWER_REPHRASING_PROMPT,
16
+ MULTI_HOP_GENERATION_PROMPT,
17
+ QUESTION_GENERATION_PROMPT,
18
+ )
19
+ from graphgen.utils import compute_content_hash, detect_main_language, logger
20
+
21
+
22
+ async def _pre_tokenize(
23
+ graph_storage: NetworkXStorage, tokenizer: Tokenizer, edges: list, nodes: list
24
+ ) -> tuple:
25
 
26
  sem = asyncio.Semaphore(1000)
27
+
28
  async def handle_edge(edge: tuple) -> tuple:
29
  async with sem:
30
+ if "length" not in edge[2]:
31
+ edge[2]["length"] = len(
32
+ await asyncio.get_event_loop().run_in_executor(
33
+ None, tokenizer.encode_string, edge[2]["description"]
34
+ )
35
+ )
36
  return edge
37
 
38
  async def handle_node(node: dict) -> dict:
39
  async with sem:
40
+ if "length" not in node[1]:
41
+ node[1]["length"] = len(
42
+ await asyncio.get_event_loop().run_in_executor(
43
+ None, tokenizer.encode_string, node[1]["description"]
44
+ )
45
+ )
46
  return node
47
 
48
  new_edges = []
49
  new_nodes = []
50
 
51
+ for result in tqdm_async(
52
+ asyncio.as_completed([handle_edge(edge) for edge in edges]),
53
+ total=len(edges),
54
+ desc="Pre-tokenizing edges",
55
+ ):
56
  new_edge = await result
57
  await graph_storage.update_edge(new_edge[0], new_edge[1], new_edge[2])
58
  new_edges.append(new_edge)
59
 
60
+ for result in tqdm_async(
61
+ asyncio.as_completed([handle_node(node) for node in nodes]),
62
+ total=len(nodes),
63
+ desc="Pre-tokenizing nodes",
64
+ ):
65
  new_node = await result
66
  await graph_storage.update_node(new_node[0], new_node[1])
67
  new_nodes.append(new_node)
 
69
  await graph_storage.index_done_callback()
70
  return new_edges, new_nodes
71
 
72
+
73
+ async def _construct_rephrasing_prompt(
74
+ _process_nodes: list,
75
+ _process_edges: list,
76
+ text_chunks_storage: JsonKVStorage,
77
+ add_context: bool = False,
78
+ ) -> str:
79
  entities = [
80
+ f"{_process_node['node_id']}: {_process_node['description']}"
81
+ for _process_node in _process_nodes
82
  ]
83
  relations = [
84
  f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
85
  for _process_edge in _process_edges
86
  ]
87
 
88
+ entities_str = "\n".join(
89
+ [f"{index + 1}. {entity}" for index, entity in enumerate(entities)]
90
+ )
91
+ relations_str = "\n".join(
92
+ [f"{index + 1}. {relation}" for index, relation in enumerate(relations)]
93
+ )
94
+ language = (
95
+ "Chinese"
96
+ if detect_main_language(entities_str + relations_str) == "zh"
97
+ else "English"
98
+ )
99
 
100
  if add_context:
101
+ original_ids = [
102
+ node["source_id"].split("<SEP>")[0] for node in _process_nodes
103
+ ] + [edge[2]["source_id"].split("<SEP>")[0] for edge in _process_edges]
104
 
105
  original_ids = list(set(original_ids))
106
  original_text = await text_chunks_storage.get_by_ids(original_ids)
107
+ original_text = "\n".join(
108
+ [
109
+ f"{index + 1}. {text['content']}"
110
+ for index, text in enumerate(original_text)
111
+ ]
112
+ )
113
 
114
+ prompt = ANSWER_REPHRASING_PROMPT[language]["CONTEXT_TEMPLATE"].format(
115
  language=language,
116
  original_text=original_text,
117
  entities=entities_str,
118
+ relationships=relations_str,
119
  )
120
  return prompt
121
 
122
+ prompt = ANSWER_REPHRASING_PROMPT[language]["TEMPLATE"].format(
123
+ language=language, entities=entities_str, relationships=relations_str
 
 
124
  )
125
  return prompt
126
 
 
 
 
 
 
 
127
 
128
  def get_average_loss(batch: tuple, loss_strategy: str) -> float:
129
+ try:
130
+ if loss_strategy == "only_edge":
131
+ return sum(edge[2]["loss"] for edge in batch[1]) / len(batch[1])
132
+ if loss_strategy == "both":
133
+ return sum(edge[2]["loss"] for edge in batch[1]) + sum(
134
+ node["loss"] for node in batch[0]
135
+ ) / (len(batch[0]) + len(batch[1]))
136
+ raise ValueError("Invalid loss strategy")
137
+ except Exception as e: # pylint: disable=broad-except
138
+ logger.error("Error calculating average loss: %s", e)
139
+ return -1.0
140
+
141
 
142
  def _post_process_synthetic_data(data):
143
  block = data.split("\n\n")
 
146
  if "Question:" in line and "Answer:" in line:
147
  question = line.split("Question:")[1].split("Answer:")[0].strip()
148
  answer = line.split("Answer:")[1].strip()
149
+ qas.append({"question": question, "answer": answer})
 
 
 
150
  elif "问题:" in line and "答案:" in line:
151
  question = line.split("问题:")[1].split("答案:")[0].strip()
152
  answer = line.split("答案:")[1].strip()
153
+ qas.append({"question": question, "answer": answer})
 
 
 
154
  elif "问题:" in line and "回答:" in line:
155
  question = line.split("问题:")[1].split("回答:")[0].strip()
156
  answer = line.split("回答:")[1].strip()
157
+ qas.append({"question": question, "answer": answer})
 
 
 
158
  return qas
159
 
160
+
161
  async def traverse_graph_by_edge(
162
  llm_client: OpenAIModel,
163
  tokenizer: Tokenizer,
 
165
  traverse_strategy: TraverseStrategy,
166
  text_chunks_storage: JsonKVStorage,
167
  progress_bar: gr.Progress = None,
168
+ max_concurrent: int = 1000,
169
  ) -> dict:
170
  """
171
  Traverse the graph
 
183
  semaphore = asyncio.Semaphore(max_concurrent)
184
 
185
  async def _process_nodes_and_edges(
186
+ _process_nodes: list,
187
+ _process_edges: list,
188
  ) -> str:
189
  prompt = await _construct_rephrasing_prompt(
190
+ _process_nodes, _process_edges, text_chunks_storage, add_context=False
 
 
 
191
  )
192
  context = await llm_client.generate_answer(prompt)
193
 
194
  # post-process the context
195
  if context.startswith("Rephrased Text:"):
196
+ context = context[len("Rephrased Text:") :].strip()
197
  elif context.startswith("重述文本:"):
198
+ context = context[len("重述文本:") :].strip()
199
 
200
  return context
201
 
202
  async def _process_single_batch(
203
+ _process_batch: tuple, question_type: str = "single"
 
204
  ) -> dict:
205
  async with semaphore:
206
  context = await _process_nodes_and_edges(
 
209
  )
210
 
211
  language = "Chinese" if detect_main_language(context) == "zh" else "English"
212
+ pre_length = sum(node["length"] for node in _process_batch[0]) + sum(
213
+ edge[2]["length"] for edge in _process_batch[1]
214
+ )
215
 
216
  if question_type == "single":
217
  question = await llm_client.generate_answer(
218
+ QUESTION_GENERATION_PROMPT[language]["SINGLE_TEMPLATE"].format(
219
  answer=context
220
  )
221
  )
222
  if question.startswith("Question:"):
223
+ question = question[len("Question:") :].strip()
224
  elif question.startswith("问题:"):
225
+ question = question[len("问题:") :].strip()
226
 
227
+ logger.info(
228
+ "%d nodes and %d edges processed",
229
+ len(_process_batch[0]),
230
+ len(_process_batch[1]),
231
+ )
232
  logger.info("Pre-length: %s", pre_length)
233
  logger.info("Question: %s", question)
234
  logger.info("Answer: %s", context)
 
237
  compute_content_hash(context): {
238
  "question": question,
239
  "answer": context,
240
+ "loss": get_average_loss(
241
+ _process_batch, traverse_strategy.loss_strategy
242
+ ),
243
  }
244
  }
245
 
246
  content = await llm_client.generate_answer(
247
+ QUESTION_GENERATION_PROMPT[language]["MULTI_TEMPLATE"].format(
248
  doc=context
249
  )
250
  )
 
252
 
253
  if len(qas) == 0:
254
  print(content)
255
+ logger.error(
256
+ "Error occurred while processing batch, question or answer is None"
257
+ )
258
  return {}
259
 
260
  final_results = {}
261
+ logger.info(
262
+ "%d nodes and %d edges processed",
263
+ len(_process_batch[0]),
264
+ len(_process_batch[1]),
265
+ )
266
  logger.info("Pre-length: %s", pre_length)
267
  for qa in qas:
268
+ logger.info("Question: %s", qa["question"])
269
+ logger.info("Answer: %s", qa["answer"])
270
+ final_results[compute_content_hash(qa["question"])] = {
271
+ "question": qa["question"],
272
+ "answer": qa["answer"],
273
+ "loss": get_average_loss(
274
+ _process_batch, traverse_strategy.loss_strategy
275
+ ),
276
  }
277
  return final_results
278
 
 
283
  edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
284
 
285
  processing_batches = await get_batches_with_strategy(
286
+ nodes, edges, graph_storage, traverse_strategy
 
 
 
287
  )
288
 
289
+ for result in tqdm_async(
290
+ asyncio.as_completed(
291
+ [_process_single_batch(batch) for batch in processing_batches]
292
+ ),
293
+ total=len(processing_batches),
294
+ desc="[4/4]Generating QAs",
295
+ ):
296
  try:
297
  if progress_bar is not None:
298
+ progress_bar(
299
+ len(results) / len(processing_batches), desc="[4/4]Generating QAs"
300
+ )
301
  results.update(await result)
302
  if progress_bar is not None and len(results) == len(processing_batches):
303
  progress_bar(1, desc="[4/4]Generating QAs")
304
+ except Exception as e: # pylint: disable=broad-except
305
  logger.error("Error occurred while generating QA: %s", e)
306
 
307
  return results
 
314
  traverse_strategy: TraverseStrategy,
315
  text_chunks_storage: JsonKVStorage,
316
  progress_bar: gr.Progress = None,
317
+ max_concurrent: int = 1000,
318
  ) -> dict:
319
  """
320
  Traverse the graph atomicly
 
331
  assert traverse_strategy.qa_form == "atomic"
332
 
333
  semaphore = asyncio.Semaphore(max_concurrent)
334
+
335
+ async def _generate_question(node_or_edge: tuple):
 
336
  if len(node_or_edge) == 2:
337
+ des = node_or_edge[0] + ": " + node_or_edge[1]["description"]
338
+ loss = node_or_edge[1]["loss"]
339
  else:
340
+ des = node_or_edge[2]["description"]
341
+ loss = node_or_edge[2]["loss"]
342
 
343
  async with semaphore:
344
  try:
345
  language = "Chinese" if detect_main_language(des) == "zh" else "English"
346
 
347
  qa = await llm_client.generate_answer(
348
+ QUESTION_GENERATION_PROMPT[language]["SINGLE_QA_TEMPLATE"].format(
349
  doc=des
350
  )
351
  )
 
359
  else:
360
  return {}
361
 
362
+ question = question.strip('"')
363
+ answer = answer.strip('"')
364
 
365
  logger.info("Question: %s", question)
366
  logger.info("Answer: %s", answer)
 
368
  compute_content_hash(question): {
369
  "question": question,
370
  "answer": answer,
371
+ "loss": loss,
372
  }
373
  }
374
+ except Exception as e: # pylint: disable=broad-except
375
  logger.error("Error occurred while generating question: %s", e)
376
  return {}
377
 
 
383
 
384
  tasks = []
385
  for node in nodes:
386
+ if "<SEP>" in node[1]["description"]:
387
+ description_list = node[1]["description"].split("<SEP>")
388
  for item in description_list:
389
+ tasks.append((node[0], {"description": item, "loss": node[1]["loss"]}))
390
  else:
391
  tasks.append((node[0], node[1]))
392
  for edge in edges:
393
+ if "<SEP>" in edge[2]["description"]:
394
+ description_list = edge[2]["description"].split("<SEP>")
395
  for item in description_list:
396
+ tasks.append(
397
+ (edge[0], edge[1], {"description": item, "loss": edge[2]["loss"]})
398
+ )
399
  else:
400
  tasks.append((edge[0], edge[1], edge[2]))
401
 
402
  for result in tqdm_async(
403
  asyncio.as_completed([_generate_question(task) for task in tasks]),
404
  total=len(tasks),
405
+ desc="[4/4]Generating QAs",
406
  ):
407
  try:
408
  if progress_bar is not None:
 
410
  results.update(await result)
411
  if progress_bar is not None and len(results) == len(tasks):
412
  progress_bar(1, desc="[4/4]Generating QAs")
413
+ except Exception as e: # pylint: disable=broad-except
414
  logger.error("Error occurred while generating QA: %s", e)
415
  return results
416
 
417
+
418
  async def traverse_graph_for_multi_hop(
419
  llm_client: OpenAIModel,
420
  tokenizer: Tokenizer,
 
422
  traverse_strategy: TraverseStrategy,
423
  text_chunks_storage: JsonKVStorage,
424
  progress_bar: gr.Progress = None,
425
+ max_concurrent: int = 1000,
426
  ) -> dict:
427
  """
428
  Traverse the graph for multi-hop
 
436
  :param max_concurrent
437
  :return: question and answer
438
  """
 
 
439
  semaphore = asyncio.Semaphore(max_concurrent)
440
 
441
  results = {}
 
445
  edges, nodes = await _pre_tokenize(graph_storage, tokenizer, edges, nodes)
446
 
447
  processing_batches = await get_batches_with_strategy(
448
+ nodes, edges, graph_storage, traverse_strategy
 
 
 
449
  )
450
 
451
+ async def _process_single_batch(_process_batch: tuple) -> dict:
 
 
452
  async with semaphore:
453
  try:
454
+ language = (
455
+ "Chinese"
456
+ if detect_main_language(_process_batch[0][0]["description"]) == "zh"
457
+ else "English"
458
+ )
459
 
460
  _process_nodes = _process_batch[0]
461
  _process_edges = _process_batch[1]
462
 
463
  entities = [
464
+ f"{_process_node['node_id']}: {_process_node['description']}"
465
+ for _process_node in _process_nodes
466
  ]
467
 
468
  relations = [
 
470
  for _process_edge in _process_edges
471
  ]
472
 
473
+ entities_str = "\n".join(
474
+ [f"{index + 1}. {entity}" for index, entity in enumerate(entities)]
475
+ )
476
+ relations_str = "\n".join(
477
+ [
478
+ f"{index + 1}. {relation}"
479
+ for index, relation in enumerate(relations)
480
+ ]
481
+ )
482
 
483
  prompt = MULTI_HOP_GENERATION_PROMPT[language].format(
484
+ entities=entities_str, relationships=relations_str
 
485
  )
486
 
487
  context = await llm_client.generate_answer(prompt)
 
496
  else:
497
  return {}
498
 
499
+ question = question.strip('"')
500
+ answer = answer.strip('"')
501
 
502
  logger.info("Question: %s", question)
503
  logger.info("Answer: %s", answer)
 
506
  compute_content_hash(question): {
507
  "question": question,
508
  "answer": answer,
509
+ "loss": get_average_loss(
510
+ _process_batch, traverse_strategy.loss_strategy
511
+ ),
512
  }
513
  }
514
 
515
+ except Exception as e: # pylint: disable=broad-except
516
  logger.error("Error occurred while processing batch: %s", e)
517
  return {}
518
 
519
  async for result in tqdm_async(
520
+ asyncio.as_completed(
521
+ [_process_single_batch(batch) for batch in processing_batches]
522
+ ),
523
  total=len(processing_batches),
524
+ desc="[4/4]Generating QAs",
525
  ):
526
  try:
527
  if progress_bar is not None:
528
+ progress_bar(
529
+ len(results) / len(processing_batches), desc="[4/4]Generating QAs"
530
+ )
531
  results.update(await result)
532
  if progress_bar is not None and len(results) == len(processing_batches):
533
  progress_bar(1, desc="[4/4]Generating QAs")
534
+ except Exception as e: # pylint: disable=broad-except
535
  logger.error("Error occurred while generating QA: %s", e)
536
  return results