DyNin committed on
Commit 8e28984 · verified · 1 Parent(s): 9a353a0

Upload 17 files

Files changed (17)
  1. .gitignore +475 -0
  2. LICENSE +21 -0
  3. LSP-generic.yml +70 -0
  4. LSP-linux.yml +77 -0
  5. LSP_train.py +386 -0
  6. MANIFEST.in +1 -0
  7. README.md +591 -29
  8. SECURITY.md +35 -0
  9. config.json +30 -0
  10. data_config.py +17 -0
  11. data_loader.py +291 -0
  12. demo.py +128 -0
  13. demo_utils.py +91 -0
  14. env.py +8 -0
  15. gradiodemo.py +39 -0
  16. prepro.py +221 -0
  17. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,475 @@
1
+ ## Ignore Visual Studio temporary files, build results, and
2
+ ## files generated by popular Visual Studio add-ons.
3
+ ##
4
+ ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
5
+
6
+ # User-specific files
7
+ *.suo
8
+ *.user
9
+ *.userosscache
10
+ *.sln.docstates
11
+
12
+ # User-specific files (MonoDevelop/Xamarin Studio)
13
+ *.userprefs
14
+
15
+ # Build results
16
+ [Dd]ebug/
17
+ [Dd]ebugPublic/
18
+ [Rr]elease/
19
+ [Rr]eleases/
20
+ x64/
21
+ x86/
22
+ bld/
23
+ [Bb]in/
24
+ [Oo]bj/
25
+ [Ll]og/
26
+
27
+ # Visual Studio 2015/2017 cache/options directory
28
+ .vs/
29
+ # Uncomment if you have tasks that create the project's static files in wwwroot
30
+ #wwwroot/
31
+
32
+ # Visual Studio 2017 auto generated files
33
+ Generated\ Files/
34
+
35
+ # MSTest test Results
36
+ [Tt]est[Rr]esult*/
37
+ [Bb]uild[Ll]og.*
38
+
39
+ # NUNIT
40
+ *.VisualState.xml
41
+ TestResult.xml
42
+
43
+ # Build Results of an ATL Project
44
+ [Dd]ebugPS/
45
+ [Rr]eleasePS/
46
+ dlldata.c
47
+
48
+ # Benchmark Results
49
+ BenchmarkDotNet.Artifacts/
50
+
51
+ # .NET Core
52
+ project.lock.json
53
+ project.fragment.lock.json
54
+ artifacts/
55
+ **/Properties/launchSettings.json
56
+
57
+ # StyleCop
58
+ StyleCopReport.xml
59
+
60
+ # Files built by Visual Studio
61
+ *_i.c
62
+ *_p.c
63
+ *_i.h
64
+ *.ilk
65
+ *.meta
66
+ *.obj
67
+ *.iobj
68
+ *.pch
69
+ *.pdb
70
+ *.ipdb
71
+ *.pgc
72
+ *.pgd
73
+ *.rsp
74
+ *.sbr
75
+ *.tlb
76
+ *.tli
77
+ *.tlh
78
+ *.tmp
79
+ *.tmp_proj
80
+ *.log
81
+ *.vspscc
82
+ *.vssscc
83
+ .builds
84
+ *.pidb
85
+ *.svclog
86
+ *.scc
87
+
88
+ # Chutzpah Test files
89
+ _Chutzpah*
90
+
91
+ # Visual C++ cache files
92
+ ipch/
93
+ *.aps
94
+ *.ncb
95
+ *.opendb
96
+ *.opensdf
97
+ *.sdf
98
+ *.cachefile
99
+ *.VC.db
100
+ *.VC.VC.opendb
101
+
102
+ # Visual Studio profiler
103
+ *.psess
104
+ *.vsp
105
+ *.vspx
106
+ *.sap
107
+
108
+ # Visual Studio Trace Files
109
+ *.e2e
110
+
111
+ # TFS 2012 Local Workspace
112
+ $tf/
113
+
114
+ # Guidance Automation Toolkit
115
+ *.gpState
116
+
117
+ # ReSharper is a .NET coding add-in
118
+ _ReSharper*/
119
+ *.[Rr]e[Ss]harper
120
+ *.DotSettings.user
121
+
122
+ # JustCode is a .NET coding add-in
123
+ .JustCode
124
+
125
+ # TeamCity is a build add-in
126
+ _TeamCity*
127
+
128
+ # DotCover is a Code Coverage Tool
129
+ *.dotCover
130
+
131
+ # AxoCover is a Code Coverage Tool
132
+ .axoCover/*
133
+ !.axoCover/settings.json
134
+
135
+ # Visual Studio code coverage results
136
+ *.coverage
137
+ *.coveragexml
138
+
139
+ # NCrunch
140
+ _NCrunch_*
141
+ .*crunch*.local.xml
142
+ nCrunchTemp_*
143
+
144
+ # MightyMoose
145
+ *.mm.*
146
+ AutoTest.Net/
147
+
148
+ # Web workbench (sass)
149
+ .sass-cache/
150
+
151
+ # Installshield output folder
152
+ [Ee]xpress/
153
+
154
+ # DocProject is a documentation generator add-in
155
+ DocProject/buildhelp/
156
+ DocProject/Help/*.HxT
157
+ DocProject/Help/*.HxC
158
+ DocProject/Help/*.hhc
159
+ DocProject/Help/*.hhk
160
+ DocProject/Help/*.hhp
161
+ DocProject/Help/Html2
162
+ DocProject/Help/html
163
+
164
+ # Click-Once directory
165
+ publish/
166
+
167
+ # Publish Web Output
168
+ *.[Pp]ublish.xml
169
+ *.azurePubxml
170
+ # Note: Comment the next line if you want to checkin your web deploy settings,
171
+ # but database connection strings (with potential passwords) will be unencrypted
172
+ *.pubxml
173
+ *.publishproj
174
+
175
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
176
+ # checkin your Azure Web App publish settings, but sensitive information contained
177
+ # in these scripts will be unencrypted
178
+ PublishScripts/
179
+
180
+ # NuGet Packages
181
+ *.nupkg
182
+ # The packages folder can be ignored because of Package Restore
183
+ **/[Pp]ackages/*
184
+ # except build/, which is used as an MSBuild target.
185
+ !**/[Pp]ackages/build/
186
+ # Uncomment if necessary however generally it will be regenerated when needed
187
+ #!**/[Pp]ackages/repositories.config
188
+ # NuGet v3's project.json files produces more ignorable files
189
+ *.nuget.props
190
+ *.nuget.targets
191
+
192
+ # Microsoft Azure Build Output
193
+ csx/
194
+ *.build.csdef
195
+
196
+ # Microsoft Azure Emulator
197
+ ecf/
198
+ rcf/
199
+
200
+ # Windows Store app package directories and files
201
+ AppPackages/
202
+ BundleArtifacts/
203
+ Package.StoreAssociation.xml
204
+ _pkginfo.txt
205
+ *.appx
206
+
207
+ # Visual Studio cache files
208
+ # files ending in .cache can be ignored
209
+ *.[Cc]ache
210
+ # but keep track of directories ending in .cache
211
+ !*.[Cc]ache/
212
+
213
+ # Others
214
+ ClientBin/
215
+ ~$*
216
+ *~
217
+ *.dbmdl
218
+ *.dbproj.schemaview
219
+ *.jfm
220
+ *.pfx
221
+ *.publishsettings
222
+ orleans.codegen.cs
223
+
224
+ # Including strong name files can present a security risk
225
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
226
+ #*.snk
227
+
228
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
229
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
230
+ #bower_components/
231
+
232
+ # RIA/Silverlight projects
233
+ Generated_Code/
234
+
235
+ # Backup & report files from converting an old project file
236
+ # to a newer Visual Studio version. Backup files are not needed,
237
+ # because we have git ;-)
238
+ _UpgradeReport_Files/
239
+ Backup*/
240
+ UpgradeLog*.XML
241
+ UpgradeLog*.htm
242
+ ServiceFabricBackup/
243
+ *.rptproj.bak
244
+
245
+ # SQL Server files
246
+ *.mdf
247
+ *.ldf
248
+ *.ndf
249
+
250
+ # Business Intelligence projects
251
+ *.rdl.data
252
+ *.bim.layout
253
+ *.bim_*.settings
254
+ *.rptproj.rsuser
255
+
256
+ # Microsoft Fakes
257
+ FakesAssemblies/
258
+
259
+ # GhostDoc plugin setting file
260
+ *.GhostDoc.xml
261
+
262
+ # Node.js Tools for Visual Studio
263
+ .ntvs_analysis.dat
264
+ node_modules/
265
+
266
+ # Visual Studio 6 build log
267
+ *.plg
268
+
269
+ # Visual Studio 6 workspace options file
270
+ *.opt
271
+
272
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
273
+ *.vbw
274
+
275
+ # Visual Studio LightSwitch build output
276
+ **/*.HTMLClient/GeneratedArtifacts
277
+ **/*.DesktopClient/GeneratedArtifacts
278
+ **/*.DesktopClient/ModelManifest.xml
279
+ **/*.Server/GeneratedArtifacts
280
+ **/*.Server/ModelManifest.xml
281
+ _Pvt_Extensions
282
+
283
+ # Paket dependency manager
284
+ .paket/paket.exe
285
+ paket-files/
286
+
287
+ # FAKE - F# Make
288
+ .fake/
289
+
290
+ # JetBrains Rider
291
+ .idea/
292
+ *.sln.iml
293
+
294
+ # CodeRush
295
+ .cr/
296
+
297
+ # Python Tools for Visual Studio (PTVS)
298
+ __pycache__/
299
+ *.pyc
300
+
301
+ # Cake - Uncomment if you are using it
302
+ # tools/**
303
+ # !tools/packages.config
304
+
305
+ # Tabs Studio
306
+ *.tss
307
+
308
+ # Telerik's JustMock configuration file
309
+ *.jmconfig
310
+
311
+ # BizTalk build output
312
+ *.btp.cs
313
+ *.btm.cs
314
+ *.odx.cs
315
+ *.xsd.cs
316
+
317
+ # OpenCover UI analysis results
318
+ OpenCover/
319
+
320
+ # Azure Stream Analytics local run output
321
+ ASALocalRun/
322
+
323
+ # MSBuild Binary and Structured Log
324
+ *.binlog
325
+
326
+ # NVidia Nsight GPU debugger configuration file
327
+ *.nvuser
328
+
329
+ # MFractors (Xamarin productivity tool) working folder
330
+ .mfractor/
331
+
332
+
333
+ # preprocessed data caused by testing
334
+ tests/corpus/*.db
335
+
336
+
337
+ # Initially taken from Github's Python gitignore file
338
+
339
+ # Byte-compiled / optimized / DLL files
340
+ __pycache__/
341
+ .idea/
342
+ *.py[cod]
343
+ *$py.class
344
+
345
+ # C extensions
346
+ *.so
347
+
348
+ # Distribution / packaging
349
+ .DS_Store
350
+ .Python
351
+ build/
352
+ develop-eggs/
353
+ dist/
354
+ downloads/
355
+ eggs/
356
+ .eggs/
357
+ lib/
358
+ lib64/
359
+ parts/
360
+ sdist/
361
+ var/
362
+ wheels/
363
+ *.egg-info/
364
+ .installed.cfg
365
+ *.egg
366
+ MANIFEST
367
+
368
+ # PyInstaller
369
+ # Usually these files are written by a python script from a template
370
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
371
+ *.manifest
372
+ *.spec
373
+
374
+ # Installer logs
375
+ pip-log.txt
376
+ pip-delete-this-directory.txt
377
+
378
+ # Unit test / coverage reports
379
+ htmlcov/
380
+ .tox/
381
+ .nox/
382
+ .coverage
383
+ .coverage.*
384
+ .cache
385
+ nosetests.xml
386
+ coverage.xml
387
+ *.cover
388
+ .hypothesis/
389
+ .pytest_cache/
390
+
391
+ # Translations
392
+ *.mo
393
+ *.pot
394
+
395
+ # Django stuff:
396
+ *.log
397
+ local_settings.py
398
+ db.sqlite3
399
+
400
+ # Flask stuff:
401
+ instance/
402
+ .webassets-cache
403
+
404
+ # Scrapy stuff:
405
+ .scrapy
406
+
407
+ # Sphinx documentation
408
+ docs/_build/
409
+
410
+ # PyBuilder
411
+ target/
412
+
413
+ # Jupyter Notebook
414
+ .ipynb_checkpoints
415
+
416
+ # IPython
417
+ profile_default/
418
+ ipython_config.py
419
+
420
+ # pyenv
421
+ .python-version
422
+
423
+ # celery beat schedule file
424
+ celerybeat-schedule
425
+
426
+ # SageMath parsed files
427
+ *.sage.py
428
+
429
+ # unstaged code
430
+ # run_gpt2.py
431
+
432
+ # Environments
433
+ .env
434
+ .venv
435
+ env/
436
+ venv/
437
+ ENV/
438
+ env.bak/
439
+ venv.bak/
440
+
441
+ # Spyder project settings
442
+ .spyderproject
443
+ .spyproject
444
+
445
+ # Rope project settings
446
+ .ropeproject
447
+
448
+ # mkdocs documentation
449
+ /site
450
+
451
+ # mypy
452
+ .mypy_cache/
453
+ .dmypy.json
454
+ dmypy.json
455
+
456
+ # Pyre type checker
457
+ .pyre/
458
+
459
+ # vscode
460
+ .vscode
461
+
462
+ # TF code
463
+ tensorflow_code
464
+
465
+ # Models
466
+ # models
467
+ models/
468
+ # test.txt
469
+ # dstc/
470
+ # scripts/
471
+ # philly/
472
+ # demo/
473
+ # docker/
474
+ # prepro_v4.py
475
+ # prepro.py
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
LSP-generic.yml ADDED
@@ -0,0 +1,70 @@
1
+ name: LSP
2
+ channels:
3
+ - pytorch
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=main
7
+ - blas=1.0=mkl
8
+ - ca-certificates=2019.5.15=1
9
+ - certifi=2019.6.16=py36_1
10
+ - cffi=1.12.3
11
+ - cudatoolkit=10.0.130=0
12
+ - freetype=2.9.1
13
+ - intel-openmp=2019.4
14
+ - jpeg=9b
15
+ - libpng=1.6.37
16
+ - libtiff=4.0.10
17
+ - mkl=2019.4
18
+ - mkl-service=2.3.0
19
+ - mkl_fft=1.0.14
20
+ - mkl_random=1.0.2
21
+ - ninja=1.9.0
22
+ - numpy=1.16.5
23
+ - numpy-base=1.16.5
24
+ - olefile=0.46=py36_0
25
+ - openssl=1.1.1d
26
+ - pillow=6.1.0
27
+ - pip=19.2.2=py36_0
28
+ - pycparser=2.19=py36_0
29
+ - python=3.6.9
30
+ - pytorch=1.2.0
31
+ - setuptools=41.0.1=py36_0
32
+ - six=1.12.0=py36_0
33
+ - sqlite=3.29.0
34
+ - tk=8.6.8
35
+ - torchvision=0.4.0=py36_cu100
36
+ - wheel=0.33.4=py36_0
37
+ - xz=5.2.4
38
+ - zlib=1.2.11
39
+ - zstd=1.3.7
40
+ - nltk=3.4.1
41
+ - pip:
42
+ - backcall==0.1.0
43
+ - boto3==1.9.228
44
+ - botocore==1.12.228
45
+ - chardet==3.0.4
46
+ - decorator==4.4.0
47
+ - docutils==0.15.2
48
+ - idna==2.8
49
+ - ipython==7.8.0
50
+ - ipython-genutils==0.2.0
51
+ - jedi==0.15.1
52
+ - jmespath==0.9.4
53
+ - parso==0.5.1
54
+ - pexpect==4.7.0
55
+ - pickleshare==0.7.5
56
+ - prompt-toolkit==2.0.9
57
+ - ptyprocess==0.6.0
58
+ - pygments==2.4.2
59
+ - python-dateutil==2.8.0
60
+ - pytorch-pretrained-bert==0.6.1
61
+ - regex==2019.8.19
62
+ - requests==2.22.0
63
+ - s3transfer==0.2.1
64
+ - tqdm==4.35.0
65
+ - traitlets==4.3.2
66
+ - urllib3==1.25.3
67
+ - wcwidth==0.1.7
68
+ - flashtext==2.7
69
+ prefix: /home/JJteam/anaconda3/envs/LSP
70
+
LSP-linux.yml ADDED
@@ -0,0 +1,77 @@
1
+ name: LSP
2
+ channels:
3
+ - pytorch
4
+ - defaults
5
+ dependencies:
6
+ - _libgcc_mutex=0.1=main
7
+ - blas=1.0=mkl
8
+ - ca-certificates=2019.5.15=1
9
+ - certifi=2019.6.16=py36_1
10
+ - cffi=1.12.3=py36h2e261b9_0
11
+ - cudatoolkit=10.0.130=0
12
+ - freetype=2.9.1=h8a8886c_1
13
+ - intel-openmp=2019.4=243
14
+ - jpeg=9b=h024ee3a_2
15
+ - libedit=3.1.20181209=hc058e9b_0
16
+ - libffi=3.2.1=hd88cf55_4
17
+ - libgcc-ng=9.1.0=hdf63c60_0
18
+ - libgfortran-ng=7.3.0=hdf63c60_0
19
+ - libpng=1.6.37=hbc83047_0
20
+ - libstdcxx-ng=9.1.0=hdf63c60_0
21
+ - libtiff=4.0.10=h2733197_2
22
+ - mkl=2019.4=243
23
+ - mkl-service=2.3.0=py36he904b0f_0
24
+ - mkl_fft=1.0.14=py36ha843d7b_0
25
+ - mkl_random=1.0.2=py36hd81dba3_0
26
+ - ncurses=6.1=he6710b0_1
27
+ - ninja=1.9.0=py36hfd86e86_0
28
+ - numpy=1.16.5=py36h7e9f1db_0
29
+ - numpy-base=1.16.5=py36hde5b4d6_0
30
+ - olefile=0.46=py36_0
31
+ - openssl=1.1.1d=h7b6447c_1
32
+ - pillow=6.1.0=py36h34e0f95_0
33
+ - pip=19.2.2=py36_0
34
+ - pycparser=2.19=py36_0
35
+ - python=3.6.9=h265db76_0
36
+ - pytorch=1.2.0=py3.6_cuda10.0.130_cudnn7.6.2_0
37
+ - readline=7.0=h7b6447c_5
38
+ - setuptools=41.0.1=py36_0
39
+ - six=1.12.0=py36_0
40
+ - sqlite=3.29.0=h7b6447c_0
41
+ - tk=8.6.8=hbc83047_0
42
+ - torchvision=0.4.0=py36_cu100
43
+ - wheel=0.33.4=py36_0
44
+ - xz=5.2.4=h14c3975_4
45
+ - zlib=1.2.11=h7b6447c_3
46
+ - zstd=1.3.7=h0b5b093_0
47
+ - nltk=3.4.1
48
+ - pip:
49
+ - backcall==0.1.0
50
+ - boto3==1.9.228
51
+ - botocore==1.12.228
52
+ - chardet==3.0.4
53
+ - decorator==4.4.0
54
+ - docutils==0.15.2
55
+ - idna==2.8
56
+ - ipython==7.8.0
57
+ - ipython-genutils==0.2.0
58
+ - jedi==0.15.1
59
+ - jmespath==0.9.4
60
+ - parso==0.5.1
61
+ - pexpect==4.7.0
62
+ - pickleshare==0.7.5
63
+ - prompt-toolkit==2.0.9
64
+ - ptyprocess==0.6.0
65
+ - pygments==2.4.2
66
+ - python-dateutil==2.8.0
67
+ - pytorch-pretrained-bert==0.6.1
68
+ - regex==2019.8.19
69
+ - requests==2.22.0
70
+ - s3transfer==0.2.1
71
+ - tqdm==4.35.0
72
+ - traitlets==4.3.2
73
+ - urllib3==1.25.3
74
+ - wcwidth==0.1.7
75
+ - flashtext==2.7
76
+ prefix: /home/JJteam/anaconda3/envs/LSP
77
+
LSP_train.py ADDED
@@ -0,0 +1,386 @@
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ '''
4
+ * @Desc: train GPT2 from scratch/ fine tuning.
5
+ Modified based on Huggingface GPT-2 implementation
6
+ '''
7
+
8
+ import json
9
+ import os
10
+ import sys
11
+ import argparse
12
+ import logging
13
+ import time
14
+ import tqdm
15
+ import datetime
16
+ import torch
17
+
18
+ import numpy as np
19
+
20
+ from os.path import join
21
+ from torch.distributed import get_rank, get_world_size
22
+
23
+ from lsp_model import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, Adam
24
+ from gpt2_training.train_utils import load_model, boolean_string, set_lr, get_eval_list_same_length
25
+ from gpt2_training.eval_utils import eval_model_loss
26
+
27
+ from data_loader import BucketingDataLoader, DynamicBatchingLoader, DistributedBucketingDataLoader
28
+
29
+
30
+ from gpt2_training.distributed import all_reduce_and_rescale_tensors, all_gather_list
31
+
32
+
33
+ logging.basicConfig(
34
+ format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
35
+ datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO)
36
+ logger = logging.getLogger(__name__)
37
+
38
+ INF = 100000000
39
+ CACHE_EMPTY_STEP = 10000
40
+ EVAL_STEP = 100000
41
+
42
+ #########################################################################
43
+ # Prepare Parser
44
+ ##########################################################################
45
+
46
+ parser = argparse.ArgumentParser()
47
+ parser.add_argument('--model_name_or_path', type=str,
48
+ help='pretrained model name or path to local checkpoint')
49
+ parser.add_argument("--seed", type=int, default=42)
50
+ parser.add_argument("--max_seq_length", type=int, default=128)
51
+
52
+ parser.add_argument("--skip_eval", action='store_true',
53
+ help='If true, skip evaluation.')
54
+ parser.add_argument("--init_checkpoint", type=str)
55
+ parser.add_argument("--train_input_file", type=str)
56
+ parser.add_argument("--eval_input_file", type=str)
57
+ parser.add_argument("--continue_from", type=int, default=0)
58
+
59
+ parser.add_argument("--train_batch_size", type=int, default=4,
60
+ help="batch size now means per GPU per step")
61
+ parser.add_argument("--gradient_accumulation_steps", type=int, default=2,
62
+ help="to increase effective batch size "
63
+ "and reduce synchronization")
64
+ parser.add_argument("--eval_batch_size", type=int, default=4)
65
+ parser.add_argument("--learning_rate", type=float, default=1e-5)
66
+ parser.add_argument("--num_optim_steps", type=int, default=1000000,
67
+ help="new API specifies num update steps")
68
+ parser.add_argument("--valid_step", type=int, default=10000,
69
+ help="how many optim steps between validations")
70
+ parser.add_argument("--warmup_proportion", type=float, default=0.1)
71
+ parser.add_argument("--warmup_steps", type=int, default=16000)
72
+
73
+ parser.add_argument("--normalize_data", type=boolean_string, default=True)
74
+ parser.add_argument("--fp16", type=boolean_string, default=True)
75
+ parser.add_argument("--lr_schedule", type=str,
76
+ choices=['noam', 'noamwd', 'BERT', 'None'], default='noam')
77
+ parser.add_argument("--loss_scale", type=float, default=0)
78
+ parser.add_argument("--no_token_id", type=boolean_string, default=True)
79
+
80
+ parser.add_argument("--output_dir", type=str)
81
+ parser.add_argument("--log_dir", type=str)
82
+ parser.add_argument('--pbar', type=boolean_string, default=True, help='turn on progress bar')
83
+
84
+ # distributed
85
+ parser.add_argument('--local_rank', type=int, default=-1,
86
+ help='for torch.distributed')
87
+ parser.add_argument('--config', help='JSON config file')
88
+
89
+
90
+ # do normal parsing
91
+ args = parser.parse_args()
92
+
93
+ if args.config is not None:
94
+ # override argparse defaults by config JSON
95
+ opts = json.load(open(args.config))
96
+ for k, v in opts.items():
97
+ if isinstance(v, str):
98
+ # PHILLY ENV special cases
99
+ if 'PHILLY_JOB_DIRECTORY' in v:
100
+ v = v.replace('PHILLY_JOB_DIRECTORY',
101
+ os.environ['PHILLY_JOB_DIRECTORY'])
102
+ elif 'PHILLY_LOG_DIRECTORY' in v:
103
+ v = v.replace('PHILLY_LOG_DIRECTORY',
104
+ os.environ['PHILLY_LOG_DIRECTORY'])
105
+ setattr(args, k, v)
106
+
107
+ # command line should override config JSON
108
+ argv = sys.argv[1:]
109
+ overrides, _ = parser.parse_known_args(argv)
110
+ for k, v in vars(overrides).items():
111
+ if f'--{k}' in argv:
112
+ setattr(args, k, v)
113
+ setattr(args, 'local_rank', overrides.local_rank)
114
+
115
+
116
+ assert args.train_batch_size % args.gradient_accumulation_steps == 0, \
117
+ 'batch size % gradient accumulation steps != 0!'
118
+ args.train_batch_size = (args.train_batch_size
119
+ // args.gradient_accumulation_steps)
120
+ logger.info('train batch size = {}, '
121
+ 'new train batch size (after gradient accumulation) = {}'.format(
122
+ args.train_batch_size*args.gradient_accumulation_steps,
123
+ args.train_batch_size))
124
+
125
+
126
+ if args.local_rank == -1:
127
+ logger.info('CUDA available? {}'.format(str(torch.cuda.is_available())))
128
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
129
+ n_gpu = torch.cuda.device_count()
130
+ args.device, args.n_gpu = device, n_gpu
131
+ else:
132
+ # distributed training
133
+ torch.cuda.set_device(args.local_rank)
134
+ device = torch.device("cuda", args.local_rank)
135
+ # Initializes the distributed backend which will take care of
136
+ # sychronizing nodes/GPUs
137
+ torch.distributed.init_process_group(backend='nccl')
138
+ n_gpu = torch.distributed.get_world_size()
139
+ args.device, args.n_gpu = device, 1
140
+ logger.info("device: {} n_gpu: {}, distributed training: {}, "
141
+ "16-bits training: {}".format(
142
+ device, n_gpu, bool(args.local_rank != -1), args.fp16))
143
+
144
+ np.random.seed(args.seed)
145
+ torch.random.manual_seed(args.seed)
146
+ torch.cuda.manual_seed(args.seed)
147
+ if n_gpu > 0:
148
+ torch.cuda.manual_seed_all(args.seed)
149
+
150
+ timestamp = datetime.datetime.now().strftime('%Y-%m-%d%H%M%S')
151
+ output_dir = join(args.output_dir,
152
+ 'GPT2.{}.{}.{}gpu.{}'.format(args.learning_rate,
153
+ args.train_batch_size, n_gpu,
154
+ timestamp))
155
+ log_dir = args.log_dir if args.log_dir is not None and len(args.log_dir) > 0 else output_dir
156
+ if args.local_rank == -1 or get_rank() == 0:
157
+ os.makedirs(output_dir, exist_ok=True)
158
+
159
+ logger.info('Input Argument Information')
160
+ args_dict = vars(args)
161
+ for a in args_dict:
162
+ logger.info('%-28s %s' % (a, args_dict[a]))
163
+
164
+
165
+ #########################################################################
166
+ # Prepare Data Set
167
+ ##########################################################################
168
+ enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
169
+
170
+ config = GPT2Config.from_json_file(
171
+ join(args.model_name_or_path, 'config.json'))
172
+
173
+ if args.local_rank == -1:
174
+ train_dataloader = BucketingDataLoader(args.train_input_file,
175
+ args.train_batch_size,
176
+ args.max_seq_length)
177
+ else:
178
+ train_dataloader = DistributedBucketingDataLoader(
179
+ get_rank(), get_world_size(),
180
+ args.train_input_file, args.train_batch_size,
181
+ args.max_seq_length)
182
+
183
+ eval_dataloader_loss = DynamicBatchingLoader(
184
+ args.eval_input_file, enc, args.normalize_data,
185
+ args.eval_batch_size, args.max_seq_length)
186
+
187
+ eval_dataloader_gen = get_eval_list_same_length(
188
+ args.eval_input_file, enc, args.eval_batch_size, True)
189
+
190
+
191
+ #########################################################################
192
+ # Prepare Model and Optimizer
193
+ ##########################################################################
194
+ model = load_model(GPT2LMHeadModel(config), args.init_checkpoint,
195
+ args, verbose=True)
196
+ if args.local_rank != -1:
197
+ # when from scratch make sure initial models are the same
198
+ params = [p.data for p in model.parameters()]
199
+ all_reduce_and_rescale_tensors(
200
+ params, float(torch.distributed.get_world_size()))
201
+
202
+ model_parameters = filter(lambda p: p.requires_grad, model.parameters())
203
+ total_params = sum([np.prod(p.size()) for p in model_parameters])
204
+ logger.info('Number of parameter = {}'.format(total_params))
205
+
206
+ param_optimizer = list(model.named_parameters())
207
+ no_decay = ['bias', 'ln'] # no decay for bias and LayerNorm (ln)
208
+ optimizer_grouped_parameters = [
209
+ {'params': [p for n, p in param_optimizer
210
+ if not any(nd in n for nd in no_decay)],
211
+ 'weight_decay': 0.01},
212
+ {'params': [p for n, p in param_optimizer
213
+ if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
214
+ ]
215
+
216
+ if args.fp16:
217
+ logger.info('in fp16, using FusedAdam')
218
+ try:
219
+ from apex.optimizers import FP16_Optimizer
220
+ from apex.optimizers import FusedAdam
221
+ except ImportError:
222
+ raise ImportError(
223
+ "Please install apex from https://www.github.com/nvidia/apex "
224
+ "to use distributed and fp16 training.")
225
+
226
+ optimizer = FusedAdam(optimizer_grouped_parameters,
227
+ lr=args.learning_rate,
228
+ bias_correction=False,
229
+ max_grad_norm=1.0)
230
+ if args.loss_scale == 0:
231
+ optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True,
232
+ verbose=False)
233
+ else:
234
+ optimizer = FP16_Optimizer(optimizer,
235
+ static_loss_scale=args.loss_scale,
236
+ verbose=False)
237
+ else:
238
+ optimizer = Adam(optimizer_grouped_parameters, args.learning_rate,
239
+ max_grad_norm=1.0)
240
+
241
+ #########################################################################
242
+ # Training !
243
+ ##########################################################################
244
+
245
+ if args.local_rank == -1 or get_rank() == 0:
246
+ train_logger = open(join(log_dir, 'train_log.txt'), 'a+', buffering=1)
247
+ eval_logger = open(join(log_dir, 'eval_log.txt'), 'a+', buffering=1)
248
+ print('epoch,global_step,step,mean_loss,mean_ppl,n_token_real,'
249
+ 'n_token_total,epoch_time', file=train_logger)
250
+ print('epoch,global_step,step,eval_loss,eval_ppl', file=eval_logger)
251
+
252
+ global_step = 0
253
+ step = 0
254
+ epoch = 0
255
+
256
+ if args.continue_from:
257
+ global_step = args.continue_from
258
+ step = global_step*2 - 1
259
+
260
+
261
+ if args.local_rank != -1:
262
+ n_gpu = 1
263
+ if args.local_rank == -1 or get_rank() == 0:
264
+ if args.pbar:
265
+ pbar = tqdm.tqdm(total=args.num_optim_steps, desc=f"training")
266
+ else:
267
+ pbar = None
268
+
269
+ while True:
270
+ model.train()
271
+ (tr_loss, tr_ppl, mean_ppl, nb_tr_examples, nb_tr_steps) = 0.0, 0.0, 0.0, 0, 0
272
+ n_token_real, n_token_total = 0, 0
273
+ train_start_time_epoch = time.time()
274
+ for batch in train_dataloader:
275
+ # activate new training mode
276
+ seq_len = batch[0].shape[1]
277
+ batch = tuple(t.to(device) for t in batch)
278
+ input_ids, position_ids, token_ids, label_ids, *_ = batch
279
+ if args.no_token_id:
280
+ token_ids = None
281
+ loss, ppl = model(input_ids, position_ids, token_ids, label_ids)
282
+
283
+ if n_gpu > 1:
284
+ loss = loss.mean()
285
+ ppl = ppl.mean()
286
+ loss = loss / (args.train_batch_size / input_ids.shape[0])
287
+ if args.fp16:
288
+ optimizer.backward(loss)
289
+ else:
290
+ loss.backward()
291
+
292
+ tr_loss += float(loss.item()) * (args.train_batch_size / input_ids.shape[0])
293
+ nb_tr_examples += input_ids.size(0)
294
+ nb_tr_steps += 1
295
+ mean_loss = tr_loss / nb_tr_steps
296
+ if ppl.item() < INF:
297
+ tr_ppl += ppl.item()
298
+ else:
299
+ tr_ppl += mean_ppl
300
+ mean_ppl = tr_ppl / nb_tr_steps
301
+
302
+ n_token_total += input_ids.shape[0] * input_ids.shape[1]
303
+ n_token_real += (input_ids != 0).sum().item()
304
+
305
+ # gradient update
306
+ step += 1
307
+ if step % args.gradient_accumulation_steps == 0:
308
+ set_lr(optimizer, global_step,
309
+ args.lr_schedule, args.learning_rate,
310
+ args.warmup_steps, args.warmup_proportion,
311
+ config.n_embd, args.num_optim_steps)
312
+
313
+ if args.local_rank != -1:
314
+ grads = [p.grad.data for p in model.parameters()
315
+ if p.requires_grad and p.grad is not None]
316
+ all_reduce_and_rescale_tensors(grads, float(1))
317
+
318
+ optimizer.step()
319
+ optimizer.zero_grad()
320
+ global_step += 1
321
+
322
+ # Print log info to file
323
+ if args.local_rank != -1:
324
+ mean_loss = sum(all_gather_list(mean_loss)) / get_world_size()
325
+ mean_ppl = sum(all_gather_list(mean_ppl)) / get_world_size()
326
+ n_token_real_all_proc = sum(all_gather_list(n_token_real))
327
+ n_token_total_all_proc = sum(all_gather_list(n_token_total))
328
+ else:
329
+ n_token_real_all_proc = n_token_real
330
+ n_token_total_all_proc = n_token_total
331
+
332
+ if args.local_rank == -1 or get_rank() == 0:
333
+ epoch_time = time.time() - train_start_time_epoch
334
+ if pbar is not None:
335
+ pbar.set_postfix_str(
336
+ f"tok/s: {n_token_real_all_proc//epoch_time//1000}k "
337
+ f"ppl: {mean_ppl:.2f} epoch: {epoch}")
338
+ pbar.update(1)
339
+ print('{},{},{},{},{},{},{},{}'.format(
340
+ epoch+1, global_step+1, step+1, mean_loss, mean_ppl,
341
+ n_token_real_all_proc, n_token_total_all_proc, epoch_time),
342
+ file=train_logger)
343
+
344
+ if global_step % args.valid_step == 0:
345
+ if args.local_rank == -1 or get_rank() == 0:
346
+ # only rank 0 process evaluate
347
+ torch.save(
348
+ {k: (v.cpu() if v is not None else None) # save to cpu tensors
349
+ for k, v in model.state_dict().items()},
350
+ join(output_dir,
351
+ f'GP2-pretrain-step-{global_step}.pkl'))
352
+
353
+ eval_loss, eval_ppl = eval_model_loss(
354
+ model, enc, eval_dataloader_loss, epoch, args)
355
+ # enable generation step evaluation for now
356
+ # gen_response = eval_model_generation(
357
+ # model, enc, eval_dataloader_gen, epoch, args)
358
+ '''
359
+ # probably use beam search only for test set
360
+ if False:
361
+ gen_response_beam = eval_model_generation(
362
+ model, enc, eval_dataloader_gen, epoch, args,
363
+ use_beam_search=True, beam_width=3)
364
+ '''
365
+ print('{},{},{},{},{}'.format(
366
+ epoch+1, global_step+1, step+1, eval_loss, eval_ppl),
367
+ file=eval_logger)
368
+ logger.info('current learning rate: '
369
+ + str(optimizer.param_groups[0]['lr']))
370
+ model.train()
371
+ if global_step >= args.num_optim_steps:
372
+ break
373
+
374
+ if (step+1) % CACHE_EMPTY_STEP == 0:
375
+ torch.cuda.empty_cache()
376
+
377
+ if global_step >= args.num_optim_steps:
378
+ break
379
+ epoch += 1
380
+
381
+
382
+ if args.local_rank == -1 or get_rank() == 0:
383
+ if pbar is not None:
384
+ pbar.close()
385
+ train_logger.close()
386
+ eval_logger.close()
MANIFEST.in ADDED
@@ -0,0 +1 @@
1
+ include LICENSE
README.md CHANGED
@@ -1,16 +1,425 @@
1
- ---
2
- thumbnail: https://huggingface.co/front/thumbnails/dialogpt.png
3
- tags:
4
- - conversational
5
- license: mit
6
- ---
7
 
8
- ## A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)
9
 
10
- DialoGPT is a SOTA large-scale pretrained dialogue response generation model for multiturn conversations.
11
- The [human evaluation results](https://github.com/dreasysnail/Dialogpt_dev#human-evaluation) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test.
12
- The model is trained on 147M multi-turn dialogue from Reddit discussion thread.
13
 
14
  * Multi-turn generation examples from an interactive environment:
15
 
16
  |Role | Response |
@@ -22,33 +431,186 @@ The model is trained on 147M multi-turn dialogue from Reddit discussion thread.
22
  |User |This is so difficult ! |
23
  | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money |
24
 
25
- Please find the information about preprocessing, training and full details of the DialoGPT in the [original DialoGPT repository](https://github.com/microsoft/DialoGPT)
26
 
27
- ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536)
28
 
29
- ### How to use
30
 
31
- Now we are ready to try out how the model works as a chatting partner!
32
 
33
- ```python
34
- from transformers import AutoModelForCausalLM, AutoTokenizer
35
- import torch
36
 
 
37
 
38
- tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
39
- model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
40
 
41
- # Let's chat for 5 lines
42
- for step in range(5):
43
- # encode the new user input, add the eos_token and return a tensor in Pytorch
44
- new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
45
 
46
- # append the new user input tokens to the chat history
47
- bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
48
 
49
- # generated a response while limiting the total chat history to 1000 tokens,
50
- chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
51
 
52
- # pretty print last ouput tokens from bot
53
- print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))
54
  ```
1
+ # A State-of-the-Art Large-scale Pretrained Response Generation Model (DialoGPT)
 
 
 
 
 
2
 
3
+ ## This project page is no longer maintained as DialoGPT is superseded by [GODEL](https://github.com/microsoft/GODEL), which outperforms DialoGPT according to the results of [this paper](https://arxiv.org/pdf/2206.11309.pdf). Unless you use DialoGPT for reproducibility reasons, we highly recommend you switch to [GODEL](https://github.com/microsoft/GODEL).
4
 
5
+ This repository contains the source code and trained model for a large-scale pretrained dialogue response generation model. The [human evaluation results](#human_eval) indicate that the response generated from DialoGPT is comparable to human response quality under a single-turn conversation Turing test.
 
 
6
 
7
+ <!--See more details on our [project page](https://www.microsoft.com/en-us/research/project/large-scale-pretraining-for-response-generation/)-->
8
+
9
+ The repository is based on [huggingface pytorch-transformer](https://github.com/huggingface/transfer-learning-conv-ai) and [OpenAI GPT-2](https://github.com/openai/gpt-2), containing the data extraction script, model training code, and pretrained small (117M), medium (345M), and large (762M) model checkpoints.
10
+
11
+ The model is trained on 147M multi-turn dialogues from Reddit discussion threads. The largest model can be trained in several hours on a machine with 8 V100 GPUs (though this is not required), with distributed training and the FP16 option.
12
+
13
+ The included scripts can be used to reproduce the results of the DSTC-7 grounded dialogue generation challenge and of a 6k multi-reference dataset created from Reddit data.
14
+
15
+ Project webpage: [https://www.microsoft.com/en-us/research/project/large-scale-pretraining-for-response-generation/](https://www.microsoft.com/en-us/research/project/large-scale-pretraining-for-response-generation/)
16
+
17
+ ArXiv paper: [https://arxiv.org/abs/1911.00536](https://arxiv.org/abs/1911.00536)
18
+
19
+
20
+ ## News ##
21
+
22
+ ***(Update 07/09/2022) Changes on the files.pushshift.io/reddit server caused our data generation pipeline to break. These problems have now been fixed, and the steps explained in the Data Preparation subsection below should work again. Data is generated in about 10 hours with 8 processes (`-j 8`), and 800GB of temporary disk space is needed.***
23
+
24
+ ***(Update 06/23/2021) We have released a retrieval-augmented/grounded version of DialoGPT (RetGen), please check out the [RetGen repo](https://github.com/dreasysnail/RetGen) and [RetGen paper](https://arxiv.org/abs/2105.06597)***
25
+
26
+ ***(Update 05/20/2021) An awesome [video walkthrough](https://www.youtube.com/watch?v=Zo679MYoJns) on YouTube for DialoGPT by [Prakhar Mishra](http://wsl.iiitb.ac.in/prakhar-mishra/)***
27
+
28
+ ***(Update 03/31/2021) A 3rd-party demo by [AK391](https://github.com/AK391) using Gradio: try out the [web demo](https://gradio.app/g/AK391/DialoGPT)!***
29
+
30
+
31
+ ***(Update 09/15/2020) A set of large-scale [dialog ranking models](https://github.com/golsun/DialogRPT) has been released!***
32
+
33
+ DialoGPT generation is improved by integrating with our latest dialog ranking models, [DialogRPT](https://github.com/golsun/DialogRPT)
34
+
35
+ ***(Update 07/08/2020) The 6K multi-ref test set has been released!***
36
+
37
+ To generate the data, please run `demo.py` and set the data option to 'full'; the generated 6k multi-ref test set will be located at
38
+
39
+ `./data/test.refs.txt`
40
+
41
+ ***(Update 03/10/2020) Model cards available in Huggingface Transformers!***
42
+
43
+ Please check out our model cards in the Hugging Face Transformers repository. With several lines of code it should be pretty straightforward to play with DialoGPT interactively.
44
+
45
+ [small model: https://huggingface.co/microsoft/DialoGPT-small](https://huggingface.co/microsoft/DialoGPT-small)
46
+
47
+ [medium model: https://huggingface.co/microsoft/DialoGPT-medium](https://huggingface.co/microsoft/DialoGPT-medium)
48
+
49
+ [large model: https://huggingface.co/microsoft/DialoGPT-large](https://huggingface.co/microsoft/DialoGPT-large)
50
+
51
+ [**(New)** Ranking model: https://huggingface.co/microsoft/DialogRPT-updown](https://huggingface.co/microsoft/DialogRPT-updown?text=I+love+NLP%21+%3C%7Cendoftext%7C%3E+Me+too%21)
52
+
53
+
54
+ ***(Update 01/06/2020) Some third-party decoding script implementations:***
55
+
56
+ - [https://github.com/polakowo/gpt2bot](https://github.com/polakowo/gpt2bot) GPT2Bot implementation based on telegram by polakowo, [ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-573904419)
57
+ - [https://colab.research.google.com/drive/1PslHE4Rl4RqSa20s7HEp0ZKITBir6ezE](https://colab.research.google.com/drive/1PslHE4Rl4RqSa20s7HEp0ZKITBir6ezE) A colab interactive notebook by qywu,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-551410203)
58
+ - [https://github.com/andreamad8/DialoGPT2-Interact](https://github.com/andreamad8/DialoGPT2-Interact) An interactive script featuring multiturn chatbot by andreamad8,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-551450016)
59
+ - [https://github.com/LHolten/DialoGTP-MMI-decoder](https://github.com/LHolten/DialoGTP-MMI-decoder) An MMI implementation by LHolten,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-558318401)
60
+ - [https://colab.research.google.com/drive/1-_KjlAV3J1IVDw_9KogjKDCzgFY7Jp7E](https://colab.research.google.com/drive/1-_KjlAV3J1IVDw_9KogjKDCzgFY7Jp7E) A colab interactive notebook by illuminascent@Reddit,[ref](https://www.reddit.com/r/MachineLearning/comments/dt5woy/p_dialogpt_state_of_the_art_conversational_model/?st=k530k3oo&sh=f6cd20fd)
61
+ - [https://colab.research.google.com/drive/15wa925dj7jvdvrz8_z3vU7btqAFQLVlG](https://colab.research.google.com/drive/15wa925dj7jvdvrz8_z3vU7btqAFQLVlG) A great tutorial of how to finetune DialoGPT to build a customized bot built by [Rostyslav Neskorozhenyi](https://www.linkedin.com/in/slanj/). [ref](https://towardsdatascience.com/make-your-own-rick-sanchez-bot-with-transformers-and-dialogpt-fine-tuning-f85e6d1f4e30)
62
+ - [https://gradio.app/g/AK391/DialoGPT](https://gradio.app/g/AK391/DialoGPT) A 3rd party demo by [AK391](https://github.com/AK391) using Gradio [web demo](https://gradio.app/g/AK391/DialoGPT)
63
+
64
+ <!--**This github repository will be updated soon. Please stay tuned.**-->
65
+ <!--## Minimal Computational Configurations-->
66
+ ## Recommended Configuration
67
+
68
+ - Linux Ubuntu 16.04
69
+ - GPU with at least 12G memory
70
+
71
+ DialoGPT was developed entirely on **Ubuntu 16.04**, and -- depending on our availability -- we try to provide support if you experience difficulties running the code on the same configuration. However, we are **unable to provide support for other distributions or operating systems**. Portions of the code may run on other UNIX flavors (macOS, Windows subsystem for Linux, Cygwin, etc.), but it is recommended to use Ubuntu for the main training code.
72
+
73
+ The training code can run on a CPU, but it will be slow. We recommend using GPUs to train and fine-tune all models. There is no minimum number of GPUs. However, when using distributed training across multiple GPUs, the speed-up versus the number of GPUs is roughly sub-linear. To simulate the same effective batch size with fewer GPUs, please use a larger `gradient_accumulation_steps` in model training, as in the sketch below.
74
+
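+ As a rough sketch (flag names are taken from `LSP_train.py` in this repository; other required arguments such as `--train_input_file` are omitted for brevity), the following two launches keep the same effective batch size of 32 sequences per optimizer step while using a similar amount of GPU memory:
+
+ ```bash
+ # 8 GPUs, 4 sequences per GPU per optimizer step (4 x 8 = 32)
+ python -m torch.distributed.launch --nproc_per_node=8 ./LSP_train.py \
+     --train_batch_size 4 --gradient_accumulation_steps 2
+
+ # 2 GPUs: raise the per-GPU batch and the accumulation steps to compensate (16 x 2 = 32)
+ python -m torch.distributed.launch --nproc_per_node=2 ./LSP_train.py \
+     --train_batch_size 16 --gradient_accumulation_steps 8
+ ```
+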
75
+ The 117M and 345M models can be loaded on a single GPU with 12G memory. The 762M model requires a single GPU with more than 16G memory for efficient training. Training speed on a benchmark dataset with 50M training instances and V100 GPUs:
76
+
77
+ | n\_gpu | epoch time (h) | token/sec |
78
+ |----------------------|--------|--------|
79
+ | 1 | 118 | 10847 |
80
+ | 2 | 62 | 20645 |
81
+ | 4 | 34 | 37647 |
82
+ | 8 | 18 | 71356 |
83
+
84
+ Fine-tuning from our pretrained model on a new dataset typically requires 1-2 epochs.
85
+
86
+
87
+ ## Setup & Installation (TL;DR)
88
+
89
+ We created a demo script, `demo.py`, to ease deployment of this system. `demo.py` contains a pipeline of **model downloading**, data extraction, data preprocessing, and model training over a dummy dataset, all within one command line.
90
+
91
+
92
+
93
+ #### Train model with Conda Environment
94
+
95
+ Please use the command lines below to clone the repository, install the requirements, and load the Conda environment (note that the NVIDIA CUDA 10.0 developer toolkit is required):
96
+
97
+
98
+ ```bash
99
+ sudo apt-get install -y make wget gzip bzip2 xz-utils zstd sed
100
+ ```
101
+
102
+ ```bash
103
+ git clone https://github.com/microsoft/DialoGPT.git
104
+ cd DialoGPT
105
+ conda env create -f LSP-linux.yml -n LSP
106
+ conda activate LSP
107
+ ```
108
+
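+ As an optional sanity check (a generic PyTorch one-liner, not part of the repository's scripts), you can verify that the environment sees a CUDA-capable GPU before training:
+
+ ```bash
+ python -c "import torch; print(torch.__version__, torch.cuda.is_available())"
+ ```
+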
109
+ If you run this on a platform other than Linux, please use `LSP-generic.yml` instead of `LSP-linux.yml`, but please note that the generic environment has not been tested on all platforms, so stability cannot be guaranteed.
110
+ To use fp16 training, please install apex using the commands below:
111
+
112
+ ```bash
113
+ conda activate LSP
114
+ git clone https://github.com/NVIDIA/apex
115
+ cd apex
116
+ git reset --hard 3d01e4a0a188cc8df54bc6e44cf5eb40ff6b4cc5
117
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
118
+ python3.6 demo.py
119
+ ```
120
+
121
+ #### Train model with Docker environment
122
+ To start, first install the docker and Nvidia-docker from their official repos.
123
+ The image environment for running the code can be loaded as below:
124
+
125
+ *Nvidia-docker v2.\**
126
+
127
+ ```bash
128
+ $ docker run --gpus all --ipc=host --rm -it -v $PWD:/workspace --network=host icaruszyz/large-scale-training:dialogpt bash
129
+ ```
130
+ *Nvidia-docker v1.\**
131
+
132
+ ```bash
133
+ $ nvidia-docker --rm -it -v $PWD:/workspace --network=host icaruszyz/large-scale-training:dialogpt bash
134
+ ```
135
+
136
+ Inside the docker container, run
137
+
138
+ ```bash
139
+ python demo.py
140
+ ```
141
+
142
+
143
+
144
+ ## Pipeline details
145
+
146
+ This section explains all components in `demo.py`.
147
+
148
+ #### Data loading
149
+ Before running `demo.py`, you can set *DATA_FOLDER* (default value `./models`) in `demo.py` to the location where you want to download all the data and pretrained/fine-tuned models. Then simply run
150
+ ```bash
151
+ python demo.py
152
+ ```
153
+ to
154
+
155
+ * automatically download models and data,
156
+ * prepare the raw data into a DB file that the program can use,
157
+ * generate a training script.
158
+
159
+ Note that by default `demo.py` uses dummy data; please specify the Reddit training data using the `--data` option. Three options are available: `dummy`, `small`, and `full`.
160
+
161
+ ```bash
162
+ python demo.py --data small
163
+ python demo.py --data full
164
+ ```
165
+
166
+ The small Reddit data is around 140MB and the full Reddit data is more than 27GB. You can prepare a cup of coffee when processing the full Reddit data because **it takes a long time**!
167
+
168
+ To generate the 6k multi-ref test set data, please run `demo.py` and set the data option to 'full'; the generated file will be located at
169
+
170
+ `./data/test.refs.txt`
171
+
172
+ #### Pretrained model
173
+
174
+ The pretrained and fine-tuned models are available on Azure Blob Storage.
175
+ Please run/see `demo.py` for more details on how to download and use those models, or download them directly using the links in `demo_utils.py`.
176
+
177
+ #### Preparing data
178
+ First, use the `prepare4db.sh` to convert a tsv data file into the correct format that the following script can recognize.
179
+ The trainig data need to be then processed into a database file with below commandline:
180
+
181
+ ```bash
182
+ python prepro.py --corpus $DATA_PATH
183
+ ```
184
+
185
+
186
+
187
+ #### Using the training script
188
+
189
+ The training script can be used in single-GPU or multi-GPU settings (distributed training across multiple GPUs within a single node):
190
+
191
+ ```bash
192
+ python ./LSP_train.py # Single GPU training
193
+ python -m torch.distributed.launch --nproc_per_node=8 ./LSP_train.py # Training on 8 GPUs
194
+ ```
195
+
196
+
197
+ The training script accepts several arguments to tweak the training:
198
+
199
+ Argument | Type | Default value | Description
200
+ ---------|------|---------------|------------
201
+ max\_seq\_length | `int` | `128` | Maximum number of tokens for each training instance.
202
+ train\_input\_file | `str` | `""` | Path of the training dataset in a .db format
203
+ eval\_input\_file | `str` | `""` | Path of the validation set in a tsv format
204
+ continue_from | `int` | `0` | Resuming the training after a specified number of steps
205
+ fp16 | `boolean` | `True` | Whether to use 16-bits floating point for model training.
206
+ train\_batch\_size | `int` | `4` | Batch size for training
207
+ eval\_batch\_size | `int` | `4` | Batch size for validation
208
+ gradient\_accumulation\_steps | `int` | `2` | Accumulate gradients on several steps
209
+ learning\_rate | `float` | `1e-5` | Learning rate
210
+ lr\_schedule | `str` | `noam` | Learning rate schedule can be chosen from [`noam`, `noamwd`, `BERT`, `None`]
211
+ num\_optim\_steps | `int` | `1000000` | Number of training optimization steps
212
+ no_token_id | `boolean` | `True` | If set True, using all-zeros token-type embedding.
213
+
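+ For illustration only, a single-GPU fine-tuning run might look like the command below. The flag names come from the table above and from `LSP_train.py`; the data paths are placeholders for your own `.db`/`.tsv` files, and the checkpoint paths follow the DSTC example further down:
+
+ ```bash
+ python ./LSP_train.py \
+     --model_name_or_path ./models/medium/ \
+     --init_checkpoint ./models/medium/medium_ft.pkl \
+     --train_input_file ./data/train.db \
+     --eval_input_file ./data/valid.tsv \
+     --train_batch_size 4 --gradient_accumulation_steps 2 \
+     --learning_rate 1e-5 --num_optim_steps 10000 \
+     --output_dir ./models/output_model
+ ```
+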
214
+
215
+ During training, two log files are updated: `train_log.txt` and `eval_log.txt` contain the model loss, perplexity, and training speed (tokens/sec) statistics for the training and dev sets.
216
+
217
+ The log files and saved model checkpoints can be found in `./models/output_model`.
218
+
219
+ #### Model decoding
220
+ We note that even with a properly filtered Reddit dataset, our model can sometimes still generate moderately toxic/inappropriate responses. For this reason, we are unable to provide the decoding script at this time (access to the live demo and decoding script is currently by invitation only).
221
+ We are still working on a controlled decoding method to prevent this system from generating toxic responses. Please stay tuned.
222
+
223
+ **See issues [#3](https://github.com/microsoft/DialoGPT/issues/3) and [Reddit discussions](https://www.reddit.com/r/MachineLearning/comments/dt5woy/p_dialogpt_state_of_the_art_conversational_model/) for some discussions on third-party decoding methods.**
224
+
225
+ See below for some third-party decoding methods:
226
+ - [https://github.com/polakowo/gpt2bot](https://github.com/polakowo/gpt2bot) GPT2Bot implementation based on telegram by polakowo, [ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-573904419)
227
+ - [https://colab.research.google.com/drive/1PslHE4Rl4RqSa20s7HEp0ZKITBir6ezE](https://colab.research.google.com/drive/1PslHE4Rl4RqSa20s7HEp0ZKITBir6ezE) A colab interactive notebook by qywu,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-551410203)
228
+ - [https://github.com/andreamad8/DialoGPT2-Interact](https://github.com/andreamad8/DialoGPT2-Interact) An interactive script featuring multiturn chatbot by andreamad8,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-551450016)
229
+ - [https://github.com/LHolten/DialoGTP-MMI-decoder](https://github.com/LHolten/DialoGTP-MMI-decoder) An MMI implementation by LHolten,[ref](https://github.com/microsoft/DialoGPT/issues/3#issuecomment-558318401)
230
+ - [https://colab.research.google.com/drive/1-_KjlAV3J1IVDw_9KogjKDCzgFY7Jp7E](https://colab.research.google.com/drive/1-_KjlAV3J1IVDw_9KogjKDCzgFY7Jp7E) A colab interactive notebook by illuminascent@Reddit,[ref](https://www.reddit.com/r/MachineLearning/comments/dt5woy/p_dialogpt_state_of_the_art_conversational_model/?st=k530k3oo&sh=f6cd20fd)
231
+ - [https://gradio.app/g/AK391/DialoGPT](https://gradio.app/g/AK391/DialoGPT) A 3rd party demo by [AK391](https://github.com/AK391) using Gradio [web demo](https://gradio.app/g/AK391/DialoGPT)
232
+
233
+ ## Models
234
+
235
+ We release 6 fine-tuned models which can be further fine-tuned on low-resource, user-customized datasets. The total parameters in these models range from 117M to 762M, in accordance with OpenAI GPT-2 model sizes.
236
+
237
+ | Model | Fine-tuned from GPT-2| Trained from scratch
238
+ |----------------------|--------|--------|
239
+ | DialoGPT 762M model| [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/large_ft.pkl) [\[huggingface model card\]](https://huggingface.co/microsoft/DialoGPT-large) | [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/large_fs.pkl) |
240
+ | DialoGPT 345M model| [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/medium_ft.pkl) [\[huggingface model card\]](https://huggingface.co/microsoft/DialoGPT-medium) | [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/medium_fs.pkl) |
241
+ | DialoGPT 117M model| [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/small_ft.pkl) [\[huggingface model card\]](https://huggingface.co/microsoft/DialoGPT-small)| [\[link\]](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/small_fs.pkl) |
242
+ | DialoGPT 345M model (reverse, for MMI)| [link](https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/small_reverse.pkl) | -|
243
+ | [DialogRPT](https://github.com/golsun/DialogRPT) (**new** ranking models) | [link](https://github.com/golsun/DialogRPT) | -|
244
+
245
+
246
+ The model files can be loaded exactly as the GPT-2 model checkpoints from Huggingface's [Transformers](https://github.com/huggingface/transformers). You can find the corresponding configuration files (`merges.txt`, `config.json`, `vocab.json`) in DialoGPT's repo in `./configs/*`.
247
+
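+ For example (the destination folder is illustrative; `demo.py` normally takes care of downloading), the fine-tuned 345M checkpoint from the table above can be fetched directly:
+
+ ```bash
+ wget https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/medium_ft.pkl -P ./models/medium/
+ ```
+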
248
+ The reverse model predicts the source from the target. This model is used for MMI reranking.
249
+
250
+ The [DialogRPT](https://github.com/golsun/DialogRPT) models are our recently proposed ranking models, used to predict human feedback (upvotes, replies) on the responses. These models can be used to improve DialoGPT generation quality (see our [EMNLP paper](https://arxiv.org/abs/2009.06978) for details).
251
+
252
+ ## Retraining full models
253
+
254
+ ### Data Preparation
255
+
256
+ The first step to retrain the full models is to generate the aforementioned 27GB Reddit dataset. This involves downloading full Reddit submission and comments dumps from [https://files.pushshift.io/reddit](https://files.pushshift.io/reddit) and creating intermediate files, which overall require **700GB of local disk space**. Downloading and processing the full data requires about 1-2 days, depending on your (CPU) compute capabilities (e.g., ~24 hours with 8 cores on a recent computer). Assuming you ran the above setup and installation steps (conda activate LSP, etc.), you can create the full dataset by running either:
257
+
258
+ ```
259
+ python demo.py --data full
260
+ ```
261
+ or
262
+ ```
263
+ cd reddit_extractor; SIZE=full make -j 8; cd ..
264
+ ```
265
+
266
+ The former command calls the latter, so the two methods are equivalent. We recommend the former, as the latter is mostly useful if you run into any problem or want to customize any arguments (e.g., the `make` command lets you build only a subset of the data). Note that the downloading phase can be error prone, for example based on your geolocation (firewall, etc.). If the above commands fail to generate `data/train.tsv`, or if that file is not anywhere close to 27GB, it means something went wrong. In that case, you may want to inspect `reddit_extractor/wget-log` and `reddit_extractor/logs/*.log` for any obvious error (e.g., wget unable to download from pushshift.io). If error messages don't make sense to you, feel free to contact us. If so, please be sure to include any error messages gathered from these log files.
267
+
268
+ Training data statistics: the generated training tsv file should be roughly 26.8 GB uncompressed, with 146.8M training instances, 3.87B source tokens, and 2.14B target tokens (including utterance-level 0/1 weights). The resulting train.tsv file should contain 146,846,215 lines.
269
+
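+ A quick sanity check on the generated file (the expected count is taken from the statistics above):
+
+ ```bash
+ wc -l data/train.tsv   # should print 146846215
+ ```
+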
270
+
271
+ ### Training
272
+
273
+ We recommend generating the above data using `demo.py --data full`, as it (1) generates the data, (2) converts it into DB format, and (3) trains a model using `python LSP_train.py`. Please edit `demo.py` directly if you want to customize any of the hyperparameters.
274
+
275
+
276
+ ## Evaluations
277
+
278
+ #### DSTC-7 challenge
279
+
280
+ Our model achieved state-of-the-art results in the [DSTC-7 Challenge response generation task](https://github.com/mgalley/DSTC7-End-to-End-Conversation-Modeling).
281
+
282
+
283
+ | Experiment | NIST2 | NIST4 | BLEU2 | BLEU4 | METEOR | ENT-4 | DIST-1 | DIST-2 | Avg. Len |
284
+ |--------------------|-------|-------|--------|-------|--------|----------|------------|------------|---------|
285
+ | Human response | 2.62 | 2.65 | 12.35% | 3.13% | 8.31% | 10.45 | 16.66% | 67.01% | 18.8 |
286
+ | DSTC-7 Winner | 2.51 | 2.52 | 14.35% | 1.83% | 8.07% | 9.03 | 10.89% | 32.49% | 15.1 |
287
+ | DialoGPT 345M | 2.80 | 2.82 | 14.16% | 2.31% | 8.51% | **10.08** | 9.13% | 39.73% | 16.9 |
288
+ | DialoGPT 345M (BS) | **2.92** | **2.97** | **19.18%** | **6.05%** | **9.29%** | 9.57 | **15.73%** | **51.03%** | 14.2 |
289
+
290
+ where ENT represents the [Entropy score](https://arxiv.org/abs/1809.05972) and DIST the [Distinct score](https://arxiv.org/pdf/1510.03055.pdf). For all metrics except the average length, larger values are better (a short sketch of the DIST computation is given below).
291
+
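+ As a quick reference, DIST-n is simply the number of unique n-grams divided by the total number of n-grams over all generated responses. The snippet below is an illustrative computation only (not the official evaluation script used for the tables here):
+
+ ```python
+ # Illustrative Distinct-n (DIST-n) computation, corpus-level over all hypotheses.
+ from itertools import chain
+
+ def distinct_n(responses, n):
+     """responses: list of tokenized hypotheses (each a list of tokens)."""
+     ngrams = list(chain.from_iterable(
+         zip(*(resp[i:] for i in range(n))) for resp in responses))
+     return len(set(ngrams)) / max(len(ngrams), 1)
+
+ hyps = ["the sun .".split(), "the sun is bigger than the moon .".split()]
+ print(distinct_n(hyps, 1), distinct_n(hyps, 2))  # DIST-1, DIST-2
+ ```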
292
+ <!--| Experiment | NIST1 | NIST2 | NIST3 | NIST4 | BLEU1 | BLEU2 | BLEU3 | BLEU4 | METEOR | ENT-1 | ENT-2 | ENT-3 | ENT-4 | DIST-1 | DIST-2 | Len |
293
+ |----------------------|--------|--------|--------|--------|--------|--------|--------|--------|--------|----------|----------|----------|----------|------------|------------|---------|
294
+ | Human | 2.4237 | 2.6244 | 2.6472 | 2.65 | 0.3408 | 0.1235 | 0.0572 | 0.0313 | 0.0831 | 6.5893 | 9.7423 | 10.4101 | 10.4450 | 0.1666 | 0.6701 | 18.7568 |
295
+ | DSTC-7 Winner | 2.3408 | 2.5102 | 2.522 | 2.523 | 0.4122 | 0.1435 | 0.0501 | 0.0183 | 0.0807 | 5.3832 | 7.6065 | 8.5304 | 9.0298 | 0.1089 | 0.3249 | 15.1327 |
296
+ | DialoGPT | 2.5863 | 2.804 | 2.823 | 2.8246 | 0.3927 | 0.1416 | 0.0555 | 0.0231 | 0.0851 | 5.5791 | 8.5109 | 9.6872 | 10.0765 | 0.0913 | 0.3973 | 16.9484 |
297
+ | DialoGPT(beam search) | **2.5943**| **2.9163** | **2.9624** | **2.9681**| **0.4238** | **0.1918** | **0.1027** | **0.0605** | **0.0929** | **6.0815** | **8.7379** | 9.4037 | 9.5697 | 0.1573 | 0.5103 | 14.1603 |-->
298
+
299
+ Note that automatic evaluation scores exceeding those of human responses do not necessarily imply that our model achieves human parity. Please check out our paper for a more detailed analysis.
300
+
301
+
302
+ To fine-tune the `345M` DialoGPT model on the DSTC-7 challenge data on a server with 8 V100 GPUs, please run the following command line (the DSTC data can be found in the [DSTC-7 repo](https://github.com/mgalley/DSTC7-End-to-End-Conversation-Modeling)):
303
+
304
+ ```bash
305
+ python3 -m torch.distributed.launch --nproc_per_node=8 LSP_train.py --init_checkpoint ./models/medium/medium_ft.pkl --train_input_file ./data/DSTC_train.db --eval_input_file ./data/DSTC_valid.tsv --model_name_or_path ./model/medium/ --learning_rate 1e-4 --train_batch_size 64 --eval_batch_size 64 --no_token_id
306
+ ```
307
+
308
+ The trained model can be found at [DSTC medium model](https://acvrpublicycchen.blob.core.windows.net/dialogpt/DSTC/medium_ft.pkl).
309
+
310
+
311
+ #### Evaluation
312
+
313
+ 1. Please **download** the following 3rd-party packages and save them into the empty folder `3rdparty`:
314
+ * [**mteval-v14c.pl**](https://goo.gl/YUFajQ) to compute [NIST](http://www.mt-archive.info/HLT-2002-Doddington.pdf). You may need to install the following [perl](https://www.perl.org/get.html) modules (e.g., via `cpan install`): XML::Twig, Sort::Naturally, and String::Util.
315
+ * [**meteor-1.5**](http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz) to compute [METEOR](http://www.cs.cmu.edu/~alavie/METEOR/index.html). It requires [Java](https://www.java.com/en/download/help/download_options.xml).
316
+
317
+
318
+ 2. Please follow the [DSTC-7 official repo](https://github.com/mgalley/DSTC7-End-to-End-Conversation-Modeling/tree/master/data_extraction) to extract the data, and put `data-official-test/test.refs.txt` into `./dstc/data/` folder.
319
+
320
+ 3. Run the extraction script below to produce the human response hypothesis file `human.resp.txt`:
321
+
322
+ ```bash
323
+ python extract_human.py
324
+ ```
325
+
326
+ 4. Finally, to reproduce the results of the human hypothesis on the DSTC dataset, please run the following command under the repo folder:
327
+
328
+ ```bash
329
+ python batch_eval.py
330
+ ```
331
+
332
+ The evaluation results will be generated in the folder `./dstc/eval/`.
333
+
334
+
335
+ ## 6K multi-ref dataset result
336
+
337
+ ### Automatic evaluation
338
+
339
+ We test on the 6K multi-ref dataset from Reddit. The results are summarized below:
340
+
341
+ | Experiment | NIST2 | NIST4 | BLEU2 | BLEU4 | METEOR | ENT-4 | DIST-1 | DIST-2 | Avg. Len |
342
+ |--------------------|-------|-------|--------|-------|--------|----------|------------|------------|---------|
343
+ | Human response | 3.41 | 4.25 | 17.90% | 7.48% | 10.64% | 11 | 14.50% | 63.00% | 13.1 |
344
+ | DialoGPT 117M | 2.39 | 2.41 | 10.54% | 1.55% | 7.53% | 10.78 | 8.60% | 39.90% | 12.8 |
345
+ | DialoGPT 345M | 3 | 3.06 | 16.96% | 4.56% | 9.81% | 9.13 | 6.80% | 26.30% | 12.2 |
346
+ | DialoGPT 762M | 2.84 | 2.9 | 18.66% | 5.25% | 9.66% | 9.72 | 7.76% | 29.93% | 11.2 |
347
+ | DialoGPT 345M (BS) | **3.4** | **3.5** | **21.76%** | **7.92%** | 10.74% | 10.48 | **12.38%** | **48.74%** | 11.3 |
348
+ | DialoGPT 345M (w/MMI)| 3.28 | 3.33 | 15.68% | 3.94% | **11.23%** | **11.25** | 9.39% | 45.55% | 17.2 |
349
+
350
+ ### <a name="human_eval"></a>Human evaluation
351
+
352
+ We further conduct human evaluations (6K examples for each method; each example is evaluated by 3 human judges). The results provide strong evidence that our generation quality approaches the quality of real human responses under this non-interactive Turing test:
353
+
354
+
355
+ *Relevance*: Between A and B, which one is more relevant to the source prompt?
356
+
357
+ | System A | A Wins (%) | Ties (%) | B Wins (%) | System B|
358
+ |--------------------|-------|-------|--------|-------|
359
+ |DialoGPT 345M|2671 (45%) | 513 (9%) | 2816 (47%)| Human responses|
360
+ |DialoGPT 345M| 3281 (72%)| 394 (9%) | 882 (19%)| [PersonalityChat](https://docs.microsoft.com/en-us/azure/cognitive-services/project-personality-chat/overview)|
361
+ |DialoGPT 345M w/ MMI| **2871** (48%)| 522 (9%) | 2607 (43%)| Human responses|
362
+
363
+ *Informativeness*: Between A and B, which one is more contentful and informative?
364
+
365
+ | System A | A Wins (%) | Ties (%) | B Wins (%) | System B|
366
+ |--------------------|-------|-------|--------|-------|
367
+ |DialoGPT 345M| 2722 (45%) | 234 (4%) | 3044 (51%)| Human responses|
368
+ |DialoGPT 345M|3490 (77%) | 206 (5%) | 861 (19%)| [PersonalityChat](https://docs.microsoft.com/en-us/azure/cognitive-services/project-personality-chat/overview)|
369
+ |DialoGPT 345M w/ MMI| **3011** (50%)| 234 (4%) | 2755 (46%)| Human responses|
370
+
371
+
372
+ *Human-Like*: Between A and B, which one do you think is more likely to have been generated by a human?
373
+
374
+ | System A | A Wins (%) | Ties (%) | B Wins (%) | System B|
375
+ |--------------------|-------|-------|--------|-------|
376
+ |DialoGPT 345M|2716 (45%) | 263 (4%) | 3021 (50%)| Human responses|
377
+ |DialoGPT 345M|3462 (76%) | 196 (4%) | 899 (20%)| [PersonalityChat](https://docs.microsoft.com/en-us/azure/cognitive-services/project-personality-chat/overview)|
378
+ |DialoGPT 345M w/ MMI| **2978** (50%)| 241 (4%) | 2781 (46%)| Human responses|
379
+
380
+
381
+ Please see full details in our [arxiv paper](https://arxiv.org/abs/1911.00536).
382
+
383
+
384
+
385
+
386
+ <!--Relevance
387
+ System Wins (%) Ties (%) Losses (%)
388
+ 2 vs 1 2671 (0.45) 513 (0.09) 2816 (0.47)
389
+ 2 vs 3 3281 (0.72) 394 (0.09) 882 (0.19)
390
+ 2 vs 4 2379 (0.40) 527 (0.09) 3094 (0.52)
391
+ 2 vs 5 3019 (0.50) 581 (0.10) 2400 (0.40)
392
+ 2 vs 6 2726 (0.45) 576 (0.10) 2698 (0.45)
393
+
394
+ Informativeness
395
+ System Wins (%) Ties (%) Losses (%)
396
+ 2 vs 1 2722 (0.45) 234 (0.04) 3044 (0.51)
397
+ 2 vs 3 3490 (0.77) 206 (0.05) 861 (0.19)
398
+ 2 vs 4 2474 (0.41) 257 (0.04) 3269 (0.54)
399
+ 2 vs 5 3230 (0.54) 362 (0.06) 2408 (0.40)
400
+ 2 vs 6 2856 (0.48) 303 (0.05) 2841 (0.47)
401
+
402
+ Human-Like
403
+ System Wins (%) Ties (%) Losses (%)
404
+ 2 vs 1 2716 (0.45) 263 (0.04) 3021 (0.50)
405
+ 2 vs 3 3462 (0.76) 196 (0.04) 899 (0.20)
406
+ 2 vs 4 2478 (0.41) 289 (0.05) 3233 (0.54)
407
+ 2 vs 5 3233 (0.54) 340 (0.06) 2427 (0.40)
408
+ 2 vs 6 2847 (0.47) 321 (0.05) 2832 (0.47)
409
+ -->
410
+
411
+
412
+ <!--| Experiment | NIST1 | NIST2 | NIST3 | NIST4 | BLEU1 | BLEU2 | BLEU3 | BLEU4 | METEOR | ENT-4 | DIST-1 | DIST-2 |
413
+ |------------------------------|-------|-------|-------|-------|--------|--------|--------|-------|--------|----------|------------|------------|
414
+ | Human response | 2.99 | 3.41 | 3.83 | 4.25 | 39.61% | 17.90% | 10.71% | 7.48% | 10.64% | 11 | 14.50% | 63.00% |
415
+ | DialoGPT 117M | 2.25 | 2.39 | 2.41 | 2.41 | 35.43% | 10.54% | 3.85% | 1.55% | 7.53% | 10.78 | 8.60% | 39.90% |
416
+ | DialoGPT 345M |**2.67** | **3** | **3.05** | **3.06** | 40.97% | 16.96% | 8.31% | 4.56% | **9.81%** | 9.13 | 6.80% | 26.30% |
417
+ | DialoGPT 762M | 2.51 | 2.84 | 2.89 | 2.9 | **44.45%** | **18.66%** | **9.32%** | **5.25%** | 9.66% | **9.72** | **7.76%** | **29.93%** |-->
418
+
419
+ <!--| DialoGPT 762M (beam search)| 2.51 | 2.9 | 2.97 | 2.98 | 44.84% | 21.08% | 12.01% | 7.57% | 10.11% | 10.06 | 11.62% | 44.07% |-->
420
+ <!--## ConvAI challenge -->
421
+ <!--| DialoGPT 345M (beam search) | 2.94 | 3.4 | 3.49 | 3.5 | 45.27% | 21.76% | 12.51% | 7.92% | 10.74% | 10.48 | 12.38% | 48.74% |-->
422
+ ## Generated Examples
423
  * Multi-turn generation examples from an interactive environment:
424
 
425
  |Role | Response |
 
431
  |User |This is so difficult ! |
432
  | Bot | You have no idea how hard it is to be a millionaire and happy . There is a reason the rich have a lot of money |
433
 
434
+ * Multi-turn self-playing dialogue with user prompt
435
+
436
+ |Role | Response |
437
+ |---------|--------|
438
+ |User | what is the meaning of life ?|
439
+ | Bot | The meaning is to be with your family and friends .|
440
+ |Bot | I’m going to guess : It means that your parents and friends have loved you . That’s the meaning , I believe . |
441
+ | Bot | I think the meaning is to make sure you’re alive when you die . So you’ll be with your family , friends , and yourself .|
442
+ |Bot |It’s not , but thanks for trying to help . |
443
 
 
444
 
 
445
 
 
446
 
 
 
 
447
 
448
+ * Commonsense question answering examples
449
 
450
+ |User | Bot |
451
+ |---------|--------|
452
+ |who is the first president of the United States | George Washington |
453
+ |what is the boiling point of water? | I think it's about 212 F. |
454
+ |which one is bigger, sun or moon? | The sun . |
455
+ |which animal has black and white stripes? | A black and white striped cat . |
456
+
457
+ * Handling context that does not appear in training examples (note that training examples are sampled from 2005 to 2017)
458
+
459
+ |User | Bot |
460
+ |---------|--------|
461
+ |The trading war between China and US is still happening . | I'm sure the Chinese government and the U . S . will be fine with that as long as they keep their arms on the table and the money flowing . |
462
+ |who won the world cup in 2018 ? | Germany , I believe . |
463
+ |Nvidia's Titan RTX is really good . | I have this card and i love it . No complaints . |
464
+ |Can Porsche beat Tesla with its new Taycan EV ? | I don't think so . The Tesla is more powerful than the porsche . |
465
+
466
+ * Our bot's response to philosophy questions
467
+
468
+ |User | Bot |
469
+ |---------|--------|
470
+ |What is the meaning of a good life ? | I think it's that we're all connected to our past lives and the meaning of life is to live the way you want to and how you want to . |
471
+ |Which is more important -- being successful or being happy ? | Being successful is just endless . Being happy is the only thing that matters . |
472
+ |Is peace the only way to stop war ? | Not if the world is not willing to work with you . Peace is the ultimate goal , but peace requires more than that . |
473
+ | How much freedom should people have ? | The right amount of freedom is the freedom to do as you please , as long as you don't hurt people or property .|
474
+
475
+
476
+ * Finally
477
+
478
+ |User | Bot |
479
+ |---------|--------|
480
+ |What do you think of the DialoGPT repo ? | It's pretty good , but it's very easy to find a bug . |
481
+
482
+ Please open an issue if you spot any :)
483
+
484
+
485
+
486
+ ## Try our system
487
+ Access to the live demo and decoding script is currently by invitation only. Please stay tuned for the full release.
488
+
489
+
490
+ <!--The live demo and decoding script access is upon approval request. Please apply [here](https://docs.google.com/spreadsheets/d/1epTNfaqva1isVO_o9pbyhVLsnzDn58dGkcLB0OUVcqs/edit?usp=sharing)-->
491
+
492
+ <!--This model should give a Hits@1 over 79, perplexity of 20.5 and F1 of 16.5 using the convai2 evaluation script (see below).
493
+
494
+ These numbers are slightly lower than the number we obtained in the ConvAI2 competition. Here is what you can tweak to reach the same results:
495
+
496
+ - in the ConvAI2 competition we also used tweaked position emebddings so that the history of the dialog always start at with the same embeddings. This is easy to add with pytorch-pretrained-bert and should improve the hits@1 metric.
497
+ - in the ConvAI2 competition we used a beam search decoder. While the results are better in term of f1 metric, our feeling is that the human experience is les compelling with beam search versus the nucleus sampling detector which is provided in the present repository.-->
498
+
499
+ <!--## Using the interaction script
500
+
501
+ The training script saves all the experiments and checkpoints in a sub-folder named with the timestamp of the experiment in the `./runs` folder of the repository base folder.
502
+
503
+ You can then use the interactive script to interact with the model simply by pointing to this folder.
504
+
505
+ Here is an example command line to run the interactive script:
506
+
507
+ ```bash
508
+ python ./interact.py --model_checkpoint ./data/Apr17_13-31-38_thunder/ # run the interactive script with a training checkpoint
509
+ python ./interact.py # run the interactive script with the finetuned model on our S3
510
+ ```
511
+
512
+ The fine-tuned model will gives FINAL Hits@1: 0.715
513
 
514
+ The interactive script accept a few arguments to tweak the decoding algorithm:
 
 
 
515
 
516
+ Argument | Type | Default value | Description
517
+ ---------|------|---------------|------------
518
+ dataset_path | `str` | `""` | Path or url of the dataset. If empty download from S3.
519
+ dataset_cache | `str` | `'./dataset_cache.bin'` | Path or url of the dataset cache
520
+ model | `str` | `"openai-gpt"` | Path, url or short name of the model
521
+ max_history | `int` | `2` | Number of previous utterances to keep in history
522
+ device | `str` | `cuda` if `torch.cuda.is_available()` else `cpu` | Device (cuda or cpu)
523
+ no_sample | action `store_true` | Set to use greedy decoding instead of sampling
524
+ max_length | `int` | `20` | Maximum length of the output utterances
525
+ min_length | `int` | `1` | Minimum length of the output utterances
526
+ seed | `int` | `42` | Seed
527
+ temperature | `int` | `0.7` | Sampling softmax temperature
528
+ top_k | `int` | `0` | Filter top-k tokens before sampling (`<=0`: no filtering)
529
+ top_p | `float` | `0.9` | Nucleus filtering (top-p) before sampling (`<=0.0`: no filtering)
530
 
531
+ ## Running ConvAI2 evaluation scripts
 
532
 
533
+ To run the evaluation scripts of the ConvAI2 challenge, you first need to install `ParlAI` in the repo base folder like this:
534
+
535
+ ```bash
536
+ git clone https://github.com/facebookresearch/ParlAI.git
537
+ cd ParlAI
538
+ python setup.py develop
539
+ ```
540
+
541
+ You can then run the evaluation script from `ParlAI` base folder:
542
+
543
+ ```bash
544
+ cd ParlAI
545
+ python ../convai_evaluation.py --eval_type hits@1 # to download and evaluate our fine-tuned model on hits@1 metric
546
+ python ../convai_evaluation.py --eval_type hits@1 --model_checkpoint ./data/Apr17_13-31-38_thunder/ # to evaluate a training checkpoint on hits@1 metric
547
  ```
548
+
549
+ The evaluation script accept a few arguments to select the evaluation metric and tweak the decoding algorithm:
550
+
551
+ Argument | Type | Default value | Description
552
+ ---------|------|---------------|------------
553
+ eval_type | `str` | `"hits@1"` | Evaluate the model on `hits@1`, `ppl` or `f1` metric on the ConvAI2 validation dataset
554
+ model | `str` | `"openai-gpt"` | Path, url or short name of the model
555
+ max_history | `int` | `2` | Number of previous utterances to keep in history
556
+ device | `str` | `cuda` if `torch.cuda.is_available()` else `cpu` | Device (cuda or cpu)
557
+ no_sample | action `store_true` | Set to use greedy decoding instead of sampling
558
+ max_length | `int` | `20` | Maximum length of the output utterances
559
+ min_length | `int` | `1` | Minimum length of the output utterances
560
+ seed | `int` | `42` | Seed
561
+ temperature | `int` | `0.7` | Sampling softmax temperature
562
+ top_k | `int` | `0` | Filter top-k tokens before sampling (`<=0`: no filtering)
563
+ top_p | `float` | `0.9` | Nucleus filtering (top-p) before sampling (`<=0.0`: no filtering)
564
+
565
+ -->
566
+
567
+ ## Related Projects
568
+
569
+ * RetGen: [https://github.com/dreasysnail/RetGen](https://github.com/dreasysnail/RetGen). Retrieval-augmented/grounded DialoGPT and beyond. RetGen is a joint training framework that simultaneously optimizes a dense passage retriever and a knowledge-grounded text generator in an end-to-end fashion.
570
+
571
+ * Microsoft ICECAPS: [https://github.com/microsoft/icecaps](https://github.com/microsoft/icecaps).
572
+
573
+ As a repository orthogonal to this project,
574
+ Microsoft Icecaps is an open-source toolkit (in TensorFlow) for building neural conversational systems. Icecaps provides an array of tools from recent conversation modeling and general NLP literature within a flexible paradigm that enables complex multi-task learning setups.
575
+
576
+ * Pretrained UniLM: [https://github.com/microsoft/unilm](https://github.com/microsoft/unilm)
577
+ * MT-DNN: [https://github.com/namisan/mt-dnn](https://github.com/namisan/mt-dnn)
578
+ * A Chinese counterpart of DialoGPT by yangjianxin1: [https://github.com/yangjianxin1/GPT2-chitchat](https://github.com/yangjianxin1/GPT2-chitchat). We are glad to see that the MMI strategy we used in DialoGPT has also improved performance for this project!
579
+
580
+ ## Contact
581
+
582
+ Please contact [[email protected]](mailto:[email protected]) if you have any questions/suggestions. However, responses may be sporadic; please expect delays.
583
+
584
+ ## Contributing
585
+
586
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a
587
+ Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
588
+ the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
589
+
590
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide
591
+ a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
592
+ provided by the bot. You will only need to do this once across all repos using our CLA.
593
+
594
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
595
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
596
+ contact [[email protected]](mailto:[email protected]) with any additional questions or comments.
597
+
598
+ ## Disclaimer
599
+
600
+ This repository aims to facilitate research in large-scale pretraining for conversational data. This toolkit contains only part of the modeling machinery needed to actually produce a model weight file in a running dialog. On its own, this model provides only information about the weights of various text spans; in order to actually use it, a researcher will need to bring conversational data of their own and decode response generation from the pretrained system. Microsoft is not responsible for any generation from third-party utilization of the pretrained system.
601
+
602
+
603
+
604
+ ## Citation
605
+ If you use this code in your research, you can cite our [arxiv paper](https://arxiv.org/abs/1911.00536):
606
+ ```bibtex
607
+ @inproceedings{zhang2019dialogpt,
608
+ title={DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation},
609
+ author={Yizhe Zhang and Siqi Sun and Michel Galley and Yen-Chun Chen and Chris Brockett and Xiang Gao and Jianfeng Gao and Jingjing Liu and Bill Dolan},
610
+ year={2020},
611
+ booktitle={ACL, system demonstration}
612
+ }
613
+ ```
614
+
615
+
616
+
SECURITY.md ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.1 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [many more](https://opensource.microsoft.com/).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets Microsoft's [definition](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)) of a security vulnerability, please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.** Instead, please report them to the Microsoft Security Response Center at [[email protected]](mailto:[email protected]). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://technet.microsoft.com/en-us/security/dn606155).
12
+
13
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
14
+
15
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
16
+
17
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
18
+ * Full paths of source file(s) related to the manifestation of the issue
19
+ * The location of the affected source code (tag/branch/commit or direct URL)
20
+ * Any special configuration required to reproduce the issue
21
+ * Step-by-step instructions to reproduce the issue
22
+ * Proof-of-concept or exploit code (if possible)
23
+ * Impact of the issue, including how an attacker might exploit the issue
24
+
25
+ This information will help us triage your report more quickly.
26
+
27
+ ## Preferred Languages
28
+
29
+ We prefer all communications to be in English.
30
+
31
+ ## Policy
32
+
33
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd).
34
+
35
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 1024,
14
+ "n_embd": 1024,
15
+ "n_head": 16,
16
+ "n_layer": 24,
17
+ "n_positions": 1024,
18
+ "resid_pdrop": 0.1,
19
+ "summary_activation": null,
20
+ "summary_first_dropout": 0.1,
21
+ "summary_proj_to_labels": true,
22
+ "summary_type": "cls_index",
23
+ "summary_use_proj": true,
24
+ "task_specific_params": {
25
+ "conversational": {
26
+ "max_length": 1000
27
+ }
28
+ },
29
+ "vocab_size": 50257
30
+ }
data_config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ import os
4
+ from . import proj_env
5
+
6
+ RAW_DATA_DIR = os.path.join(proj_env.ROOT_DIR, "raw_data")
7
+ PROCESSED_DATA_DIR = os.path.join(proj_env.ROOT_DIR, "processed")
8
+ PIPELINE_DATA_DIR = os.path.join(proj_env.ROOT_DIR, "pipeline_data")
9
+
10
+ TEST_KEY_FN = os.path.join(RAW_DATA_DIR, "keys.2k.txt")
11
+
12
+
13
+ MAX_LEN = 128 #512
14
+ MAX_CONTEXT_LEN = 64#250
15
+
16
+ TAG_LIST = ["<p>", "<title>", "<anchor>"] + ["<h%s>" % i for i in range(1, 7)]
17
+
data_loader.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ import gzip
4
+ import json
5
+ import math
6
+ import random
7
+ import shelve
8
+ import torch
9
+
10
+ import subprocess as sp
11
+
12
+ from math import ceil
13
+ from torch.utils.data import DataLoader, Sampler, Dataset
14
+ from torch.nn.utils.rnn import pad_sequence
15
+
16
+ from env import END_OF_TEXT_TOKEN
17
+ from gpt2_training.train_utils import (InputFeatures, InputFeatures_train,
18
+ RedditExample)
19
+
20
+
21
+ class BucketSampler(Sampler):
22
+ """
23
+ this sampler will sort data by sequence length
24
+ """
25
+ def __init__(self, lens, bucket_size, batch_size,
26
+ droplast=False, shuffle=True):
27
+ self._lens = lens
28
+ self._batch_size = batch_size
29
+ self._bucket_size = bucket_size
30
+ self._droplast = droplast
31
+ self._shuf = shuffle
32
+
33
+ def __iter__(self):
34
+ ids = list(range(len(self._lens)))
35
+ if self._shuf:
36
+ random.shuffle(ids)
37
+ buckets = [sorted(ids[i:i+self._bucket_size],
38
+ key=lambda i: self._lens[i], reverse=True)
39
+ for i in range(0, len(ids), self._bucket_size)]
40
+ batches = [bucket[i:i+self._batch_size]
41
+ for bucket in buckets
42
+ for i in range(0, len(bucket), self._batch_size)]
43
+ if self._droplast:
44
+ batches = [batch for batch in batches
45
+ if len(batch) == self._batch_size]
46
+ if self._shuf:
47
+ random.shuffle(batches)
48
+ return iter(batches)
49
+
50
+ def __len__(self):
51
+ bucket_sizes = ([self._bucket_size]
52
+ * (len(self._lens) // self._bucket_size)
53
+ + [len(self._lens) % self._bucket_size])
54
+ if self._droplast:
55
+ return sum(s//self._batch_size for s in bucket_sizes)
56
+ else:
57
+ return sum(math.ceil(s/self._batch_size) for s in bucket_sizes)
58
+
59
+
60
+ class GPT2FeatureDataset(Dataset):
61
+ """ pytorch dataset for GPT2 training """
62
+ def __init__(self, features, max_len=None):
63
+ self.features = features
64
+ self.max_len = max_len # this max_len do truncate
65
+
66
+ def __getitem__(self, i):
67
+ feat_dict = self.features[i]
68
+ if self.max_len is not None and feat_dict['input_len'] > self.max_len:
69
+ # truncate on the left side (context)
70
+ feat_dict['input_ids'] = feat_dict['input_ids'][-self.max_len:]
71
+ feat_dict['position_ids'] = feat_dict['position_ids'][
72
+ -self.max_len:]
73
+ feat_dict['token_type_ids'] = feat_dict['token_type_ids'][
74
+ -self.max_len:]
75
+ feat_dict['lm_labels'] = feat_dict['lm_labels'][-self.max_len:]
76
+ try:
77
+ for s in ['context_len', 'response_len']:
78
+ if s in feat_dict.keys():
79
+ print("db file missing "+s)
80
+ del feat_dict[s]
81
+ except Exception:
82
+ import pdb
83
+ pdb.set_trace()
84
+
85
+ feat = InputFeatures_train(**feat_dict)
86
+ return feat
87
+
88
+ def __len__(self):
89
+ return len(self.features)
90
+
91
+ @staticmethod
92
+ def collate(features):
93
+ input_ids = pad_sequence([torch.tensor(f.input_ids, dtype=torch.long)
94
+ for f in features],
95
+ batch_first=True, padding_value=0)
96
+ position_ids = pad_sequence([torch.tensor(f.position_ids,
97
+ dtype=torch.long)
98
+ for f in features],
99
+ batch_first=True, padding_value=0)
100
+ token_type_ids = pad_sequence([torch.tensor(f.token_type_ids,
101
+ dtype=torch.long)
102
+ for f in features],
103
+ batch_first=True, padding_value=0)
104
+ labels = pad_sequence([torch.tensor(f.lm_labels, dtype=torch.long)
105
+ for f in features],
106
+ batch_first=True, padding_value=-1)
107
+ return (input_ids, position_ids, token_type_ids, labels)
108
+
109
+
110
+ class BucketingDataLoader(object):
111
+ """ this loads shelve db chunks and then convert to mini-batch loader"""
112
+ def __init__(self, db_name, batch_size, max_seq_len,
113
+ bucket=100, shuffle=True):
114
+ self.db = shelve.open(f'{db_name}/db', 'r')
115
+ self.batch_size = batch_size
116
+ self.max_len = max_seq_len
117
+ self.bucket_size = bucket * batch_size
118
+ self.shuffle = shuffle
119
+
120
+ def _get_keys(self):
121
+ keys = list(self.db.keys())
122
+ return keys
123
+
124
+ def __iter__(self):
125
+ keys = self._get_keys()
126
+ if self.shuffle:
127
+ random.shuffle(keys)
128
+ for key in keys:
129
+ chunk = json.loads(gzip.decompress(self.db[key]).decode('utf-8'))
130
+ # discard long examples
131
+ trunc_chunk = []
132
+ lens = []
133
+ for feat in chunk:
134
+ if feat['input_len'] > self.max_len:
135
+ continue
136
+ trunc_chunk.append(feat)
137
+ lens.append(feat['input_len'])
138
+
139
+ dataset = GPT2FeatureDataset(trunc_chunk, self.max_len)
140
+ sampler = BucketSampler(lens, self.bucket_size, self.batch_size,
141
+ droplast=True, shuffle=self.shuffle)
142
+ loader = DataLoader(dataset, batch_sampler=sampler,
143
+ num_workers=0, # can test multi-worker
144
+ collate_fn=GPT2FeatureDataset.collate)
145
+ yield from loader
146
+
147
+ def __len__(self):
148
+ raise NotImplementedError()
149
+
150
+ def __del__(self):
151
+ self.db.close()
152
+
153
+
154
+ class DistributedBucketingDataLoader(BucketingDataLoader):
155
+ """ distributed version """
156
+ def __init__(self, rank, num_replica, *args, **kwargs):
157
+ super().__init__(*args, **kwargs)
158
+ self.rank = rank
159
+ self.num_replica = num_replica
160
+
161
+ def _get_keys(self):
162
+ keys = list(self.db.keys())[self.rank::self.num_replica]
163
+ return keys
164
+
165
+
166
+ def convert_examples_to_features_dynamic(examples, tokenizer,
167
+ max_seq_length=512):
168
+ """
169
+ do not pad
170
+ """
171
+ def featurize(example):
172
+ conv_id = example.conv_id
173
+ context_id = tokenizer.encode(example.context)
174
+ end_of_text_id = tokenizer.encoder[END_OF_TEXT_TOKEN]
175
+
176
+ # response is provided in example
177
+ response_id = tokenizer.encode(example.response)
178
+
179
+ input_ids_len = len(context_id) + len(response_id) + 2
180
+ if input_ids_len > max_seq_length:
181
+ if len(context_id) > input_ids_len - max_seq_length:
182
+ # cut context from beginning if length of context + response is too long
183
+ # and len of context is long enough to cut
184
+ context_id = context_id[input_ids_len - max_seq_length:]
185
+ else:
186
+ # cut response from end if length of context + response is too long
187
+ # and len of response is long enough to cut
188
+ # if no response is available, discard the data
189
+ if max_seq_length-len(context_id)-2 < 0:
190
+ return None
191
+ response_id = response_id[:max_seq_length-len(context_id)-2]
192
+
193
+ input_ids = context_id + [end_of_text_id] + response_id + [end_of_text_id]
194
+
195
+ # label simplely is next token in sequences. MASK all context_id tokens except for the last one
196
+ lm_labels = [-1] * len(context_id) + response_id + [end_of_text_id] + [-1]
197
+
198
+ position_ids = list(range(len(input_ids)))
199
+
200
+ token_type_id = [0] * len(input_ids)
201
+
202
+ return InputFeatures(conv_id, input_ids, position_ids, token_type_id,
203
+ lm_labels, len(context_id), len(response_id))
204
+
205
+ # discard None feature
206
+ features = [f for f in [featurize(ex) for ex in examples] if f is not None]
207
+ return features
208
+
209
+
210
+ class DynamicBatchingLoader(object):
211
+ """ this loader takes raw text file, used for validate perplexity """
212
+ def __init__(self, corpus_file, tokenizer, normalize_data,
213
+ batch_size, max_seq_length):
214
+ self.corpus = corpus_file
215
+ self.toker = tokenizer
216
+ self.norm = normalize_data
217
+ self.bs = batch_size
218
+ self.max_seq_length = max_seq_length
219
+ self.num_examples = self.get_len(corpus_file)
220
+
221
+ def __iter__(self, epoch=1):
222
+ if epoch > 0:
223
+ for epoch in range(epoch):
224
+ yield from self._iter_epoch()
225
+ else:
226
+ while True:
227
+ yield from self._iter_epoch()
228
+
229
+ def __len__(self):
230
+ return ceil(self.num_examples/self.bs)
231
+
232
+ def _iter_epoch(self):
233
+ try:
234
+ with open(self.corpus, 'r', encoding="utf-8") as corpus:
235
+ i = 0
236
+ while True:
237
+ examples = []
238
+ cur_bs = 0
239
+ while True:
240
+ line = next(corpus).encode('utf-8').decode('utf-8')
241
+ contents = line.split('\t')
242
+ src, tgt_all = contents[0], contents[1:]
243
+ for tgt in tgt_all:
244
+ if self.norm:
245
+ src_line = ' '.join(src.strip().split())
246
+ tgt_line = ' '.join(tgt.strip().split())
247
+ else:
248
+ src_line = src.strip()
249
+ tgt_line = tgt.strip()
250
+ examples.append(
251
+ RedditExample(i, src_line, tgt_line),
252
+ )
253
+ i += 1
254
+ cur_bs += 1
255
+ if cur_bs >= self.bs:
256
+ break
257
+ features = convert_examples_to_features_dynamic(
258
+ examples, self.toker, self.max_seq_length)
259
+ batch = self._batch_feature(features)
260
+ yield batch
261
+ except StopIteration:
262
+ pass
263
+
264
+ def _batch_feature(self, features):
265
+ input_ids = pad_sequence([torch.tensor(f.choices_features['input_ids'],
266
+ dtype=torch.long)
267
+ for f in features],
268
+ batch_first=True, padding_value=0)
269
+ position_ids = pad_sequence(
270
+ [torch.tensor(f.choices_features['position_ids'], dtype=torch.long)
271
+ for f in features],
272
+ batch_first=True, padding_value=0)
273
+ token_type_ids = pad_sequence(
274
+ [torch.tensor(f.choices_features['token_type_ids'],
275
+ dtype=torch.long)
276
+ for f in features],
277
+ batch_first=True, padding_value=0)
278
+ labels = pad_sequence([torch.tensor(f.lm_labels, dtype=torch.long)
279
+ for f in features],
280
+ batch_first=True, padding_value=-1)
281
+ context_len = torch.tensor([f.context_len for f in features],
282
+ dtype=torch.long)
283
+ response_len = torch.tensor([f.response_len for f in features],
284
+ dtype=torch.long)
285
+ return (input_ids, position_ids, token_type_ids, labels,
286
+ context_len, response_len)
287
+
288
+ def get_len(self, corpus):
289
+ n_line = int(sp.check_output(f"wc -l {corpus}".split(),
290
+ universal_newlines=True).split()[0])
291
+ return n_line
demo.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ #
4
+ # Please assign the DATA_FOLDER before running this script; the data, pre-trained model, and fine-tuned model will be
5
+ # downloaded automatically to DATA_FOLDER
6
+
7
+ import os
8
+ import sys
9
+ import logging
10
+ from functools import partial
11
+
12
+ from demo_utils import download_model_folder
13
+ import argparse
14
+ import subprocess as sp
15
+
16
+
17
+ PROJECT_FOLDER = os.path.dirname(os.path.realpath(__file__))
18
+ PYTHON_EXE = 'python'
19
+ MODEL_FOLDER = os.path.join(PROJECT_FOLDER, 'models')
20
+ DATA_FOLDER = os.path.join(PROJECT_FOLDER, 'data')
21
+
22
+ print(f'PROJECT_FOLDER = {PROJECT_FOLDER}')
23
+
24
+ parser = argparse.ArgumentParser()
25
+ parser.add_argument('--data', type=str, default='dummy',
26
+ help='choose from dummy, small and full')
27
+ dargs = parser.parse_args()
28
+
29
+ assert dargs.data == 'dummy' or dargs.data == 'small' or dargs.data == 'full' , \
30
+ 'The specified data option is not supported!'
31
+
32
+
33
+ logging.basicConfig(
34
+ format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
35
+ datefmt='%m/%d/%Y %H:%M:%S', level=logging.INFO
36
+ )
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
+ if os.path.exists(MODEL_FOLDER):
41
+ print(f'Found existing models folder at {MODEL_FOLDER}, skip creating a new one!')
42
+ os.makedirs(MODEL_FOLDER, exist_ok=True)
43
+ else:
44
+ os.makedirs(MODEL_FOLDER)
45
+
46
+ #########################################################################
47
+ # Download Model
48
+ #########################################################################
49
+ logger.info('Downloading models...')
50
+ download_model = partial(download_model_folder, DATA_FOLDER=MODEL_FOLDER)
51
+
52
+ # model size: could be one of 'small' (GPT2 with 117M), 'medium'(345M) or 'large' (1542M)
53
+ # dataset: one of 'multiref' or 'dstc'
54
+ # from_scratch: True : load model trained from scratch or False: load model trained from fine-tuning the GPT-2
55
+ target_folder = download_model(model_size='small', dataset='multiref', from_scratch=False)
56
+ logger.info('Done!\n')
57
+
58
+
59
+ #########################################################################
60
+ # Prepare Data
61
+ #########################################################################
62
+ logger.info('Downloading and Extracting Data...')
63
+ if dargs.data == 'dummy':
64
+ cmd = 'bash prepare4db.sh'
65
+ ret = sp.run(cmd.split(' '), stdout=sp.PIPE, stderr=sp.STDOUT, cwd=DATA_FOLDER)
66
+ elif dargs.data == 'small':
67
+ myCmd = os.popen('cd reddit_extractor; SIZE=small make -j 8; cd ..').read()
68
+ elif dargs.data == 'full':
69
+ myCmd = os.popen('cd reddit_extractor; SIZE=full make -j 8; cd ..').read()
70
+ else:
71
+ raise ValueError('you need to implement your own data type, or use either dummy, small, or full')
72
+
73
+ logger.info('Preparing Data...')
74
+ data_path = os.path.join(DATA_FOLDER, 'train.tsv')
75
+ MAX_LEN = 128
76
+ data_db = f'{data_path[:-4]}.{MAX_LEN}len.db'
77
+ if os.path.isdir(data_db):
78
+ print(f'{data_db} exists, skip prepro.py')
79
+ else:
80
+ cmd = ['prepro.py', '--corpus', data_path, '--max_seq_len', f'{MAX_LEN}']
81
+ cmd = ' '.join(cmd) #% {'CODE_ROOT': CODE_ROOT}
82
+ print(cmd)
83
+ ret = sp.run([PYTHON_EXE] + cmd.split(' '), stdout=sp.PIPE, stderr=sp.STDOUT, cwd=PROJECT_FOLDER)
84
+ if ret.returncode != 0:
85
+ print(f'error occurred, {ret.stdout}')
86
+ sys.exit(ret.returncode)
87
+ logger.info('Done!\n')
88
+
89
+ #########################################################################
90
+ # Train !
91
+ #########################################################################
92
+ logger.info('Generating training CMD!')
93
+ logger.info('If there is any problem, please copy (modify) and run command below')
94
+ logger.info('#########################################################################')
95
+ train_cmd = 'LSP_train.py'
96
+ args = [
97
+ '--model_name_or_path', target_folder,
98
+ '--init_checkpoint', os.path.join(target_folder, 'pytorch_model.bin'),
99
+ '--train_input_file', data_db , # file from last step
100
+ '--eval_input_file', './data/dummy_data.tsv', # dummy test data
101
+ '--output_dir', os.path.join(MODEL_FOLDER, 'output_model'),
102
+ '--seed', '42',
103
+ '--max_seq_length', '128',
104
+ '--train_batch_size', '512',
105
+ '--gradient_accumulation_steps', '8',
106
+ '--eval_batch_size', '64',
107
+ '--learning_rate', '1e-5',
108
+ '--num_optim_steps', '10000',
109
+ '--valid_step', '5000',
110
+ '--warmup_steps', '4000',
111
+ '--normalize_data', 'true',
112
+ '--fp16', 'true',
113
+ '--lr_schedule', 'noam',
114
+ '--loss_scale', '0.0',
115
+ '--no_token_id', 'true',
116
+ '--pbar', 'true'
117
+ ]
118
+
119
+ arg = ' '.join(args)
120
+ train_cmd = train_cmd + ' ' + arg
121
+ print(PYTHON_EXE + ' ' +train_cmd)
122
+ logger.info('#########################################################################')
123
+ with open('./output.log', 'wb') as f:
124
+ process = sp.Popen([PYTHON_EXE] + train_cmd.split(' '), stdout=sp.PIPE, stderr=sp.STDOUT, cwd=PROJECT_FOLDER)
125
+ for line in iter(process.stdout.readline, b''):
126
+ sys.stdout.write(line.decode(sys.stdout.encoding))
127
+ f.write(line)
128
+ logger.info('Done!\n')
demo_utils.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import os
5
+ import logging
6
+
7
+ from pytorch_pretrained_bert.file_utils import http_get
8
+
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ # Note that the model size is roughly half of the GPT model because our model is saved in fp16
14
+ LSP_MODEL_URL = {
15
+ 'multiref': {
16
+ 'large_fs': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/large_fs.pkl',
17
+ 'medium_fs': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/medium_fs.pkl',
18
+ 'medium_ft': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/medium_ft.pkl',
19
+ 'small_fs': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/small_fs.pkl',
20
+ 'small_ft': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/multiref/small_ft.pkl'
21
+ },
22
+ 'dstc': {
23
+ 'medium_ft': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/DSTC/medium_ft.pkl'
24
+ }
25
+ }
26
+
27
+ # GPT model could be downloaded from huggingface repo
28
+ GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {
29
+ "small": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
30
+ "medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
31
+ "large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"
32
+ }
33
+
34
+ CONFIG_FILE = {
35
+ 'small': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/117M/config.json',
36
+ 'medium': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/345M/config.json',
37
+ 'large': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/1542M/config.json'
38
+ }
39
+
40
+ VOCAB_FILE = {
41
+ 'small': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/117M/vocab.json',
42
+ 'medium': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/345M/vocab.json',
43
+ 'large': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/1542M/vocab.json'
44
+ }
45
+
46
+ MERGE_FILE = {
47
+ 'small': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/117M/merges.txt',
48
+ 'medium': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/345M/merges.txt',
49
+ 'large': 'https://acvrpublicycchen.blob.core.windows.net/dialogpt/1542M/merges.txt'
50
+ }
51
+
52
+
53
+ def download_file(url, folder):
54
+ if not os.path.exists(folder):
55
+ os.makedirs(folder, exist_ok=True)
56
+
57
+ file_name = os.path.basename(url)
58
+ if 'pytorch_model.bin' in file_name:
59
+ file_name = 'pytorch_model.bin'
60
+
61
+ if os.path.isfile(os.path.join(folder, file_name)):
62
+ logger.info(f'{os.path.join(folder, file_name)} exists, return!')
63
+ return
64
+
65
+ with open(os.path.join(folder, file_name), 'wb') as f:
66
+ http_get(url, f)
67
+
68
+
69
+ def download_model_folder(model_size, dataset=None, from_scratch=None, DATA_FOLDER=None):
70
+ assert DATA_FOLDER is not None, 'DATA_FOLDER cannot be None'
71
+ assert model_size in ['small', 'medium', 'large'], 'model size should be one of \'small\', \'medium\' or \'large\''
72
+ target_folder = os.path.join(DATA_FOLDER, model_size)
73
+ download_file(CONFIG_FILE[model_size], target_folder)
74
+ download_file(VOCAB_FILE[model_size], target_folder)
75
+ download_file(MERGE_FILE[model_size], target_folder)
76
+ download_file(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP[model_size], target_folder)
77
+ if dataset is not None:
78
+ assert dataset in ['multiref', 'dstc'], \
79
+ 'dataset has to be \'multiref\' or \'dstc\''
80
+ assert from_scratch in [True, False], 'from scratch has to be True or False'
81
+
82
+ if from_scratch:
83
+ model_train_type = model_size + '_fs'
84
+ else:
85
+ model_train_type = model_size + '_ft'
86
+ if model_train_type not in LSP_MODEL_URL[dataset]:
87
+ k = ','.join(list(LSP_MODEL_URL[dataset].keys()))
88
+ raise ValueError(f'\'{model_train_type}\' not exist for dataset \'{dataset}\', please choose from [{k}]')
89
+ download_file(LSP_MODEL_URL[dataset][model_train_type], target_folder)
90
+ return target_folder
91
+
env.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ import os
4
+
5
+
6
+ END_OF_TURN_TOKEN = '<|endofturn|>'
7
+ END_OF_TEXT_TOKEN = '<|endoftext|>'
8
+ PROJECT_FOLDER = os.path.dirname(__file__)
gradiodemo.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # A 3rd party demo contributed by Github user AK391 (https://github.com/AK391). This is not implemented by Microsoft and Microsoft does not own any IP with this implementation and associated demo.
2
+ # Microsoft has not tested the generation of this demo and is not responsible for any offensive or biased generation from this demo.
3
+ # Please contact the creator AK391 (https://github.com/AK391) for any potential issue.
4
+
5
+
6
+ from transformers import AutoModelForCausalLM, AutoTokenizer
7
+ import torch
8
+ import gradio as gr
9
+
10
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-large")
11
+ model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-large")
12
+
13
+ def dialogpt(text):
14
+ # encode the new user input, add the eos_token and return a tensor in Pytorch
15
+ for step in range(50000):
16
+
17
+ new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')
18
+
19
+ # append the new user input tokens to the chat history
20
+ bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids
21
+
22
+ # generated a response while limiting the total chat history to 1000 tokens,
23
+ chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
24
+
25
+ # pretty print last ouput tokens from bot
26
+ return tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
27
+
28
+ inputs = gr.inputs.Textbox(lines=1, label="Input Text")
29
+ outputs = gr.outputs.Textbox(label="DialoGPT")
30
+
31
+ title = "DialoGPT"
32
+ description = "demo for Microsoft DialoGPT with Hugging Face transformers. To use it, simply input text or click one of the examples text to load them. Read more at the links below. *This is not a Microsoft product and is developed for Gradio*"
33
+ article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1911.00536'>DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation</a> | <a href='https://github.com/microsoft/DialoGPT'>Github Repo</a> | <a href='https://huggingface.co/microsoft/DialoGPT-large'>Hugging Face DialoGPT-large</a></p>"
34
+ examples = [
35
+ ["Hi, how are you?"],
36
+ ["How far away is the moon?"],
37
+ ]
38
+
39
+ gr.Interface(dialogpt, inputs, outputs, title=title, description=description, article=article, examples=examples).launch(debug=True)
prepro.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+ """
4
+ preprocess input data into feature and stores binary as python shelve DB
5
+ each chunk is gzipped JSON string
6
+ """
7
+ import argparse
8
+ import gzip
9
+ import json
10
+ import subprocess as sp
11
+ import shelve
12
+ import os
13
+ from os.path import dirname, exists, join
14
+
15
+ import torch
16
+ from lsp_model import GPT2Tokenizer
17
+ from tqdm import tqdm
18
+
19
+ from env import END_OF_TEXT_TOKEN
20
+ from gpt2_training.train_utils import InputFeatures_train as InputFeatures
21
+
22
+
23
+ def _get_file_len(corpus):
24
+ n_line = int(sp.check_output(f"wc -l {corpus}".split(),
25
+ universal_newlines=True).split()[0])
26
+ return n_line
27
+
28
+
29
+ def _norm_text(text):
30
+ w, *toks = text.strip().split()
31
+ try:
32
+ w = float(w)
33
+ except Exception:
34
+ toks = [w] + toks
35
+ w = 1.0
36
+ return w, ' '.join(toks)
37
+
38
+
39
+ def _get_inputs_from_text(text, tokenizer):
40
+ srcs, tgt = text.strip().split('\t')
41
+ weights = []
42
+ inputs = []
43
+ for src in srcs.split(' EOS '):
44
+ src_weight, src = _norm_text(src)
45
+ context_id = tokenizer.encode(src)
46
+ weights.append(src_weight)
47
+ inputs.append(context_id)
48
+ tgt_weight, tgt = _norm_text(tgt)
49
+ if tgt_weight != 0:
50
+ response_id = tokenizer.encode(tgt)
51
+ weights.append(tgt_weight)
52
+ inputs.append(response_id)
53
+ return weights, inputs
54
+
55
+
56
+ def _make_features(id_, weights, inputs, tokenizer, max_len):
57
+ end_of_text_id = tokenizer.encoder[END_OF_TEXT_TOKEN]
58
+ features = []
59
+ sents = []
60
+ ws = []
61
+ len_ = 0
62
+ i = 0
63
+ for ids, w in zip(inputs, weights):
64
+ if len(ids) > max_len:
65
+ if len(sents) >= 2:
66
+ feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
67
+ if feat is not None:
68
+ features.append(feat)
69
+ i += 1
70
+ len_ = 0
71
+ sents = []
72
+ ws = []
73
+ continue
74
+ elif len_ > max_len:
75
+ feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
76
+ if feat is not None:
77
+ features.append(feat)
78
+ i += 1
79
+ len_ = len(sents[-1]) + 1
80
+ sents = sents[-1:]
81
+ ws = ws[-1:]
82
+ len_ += (len(ids) + 1)
83
+ sents.append(ids)
84
+ ws.append(w)
85
+ if len(sents) >= 2:
86
+ feat = _make_feature(id_ + i, sents, ws, end_of_text_id)
87
+ if feat is not None:
88
+ features.append(feat)
89
+
90
+ return features
91
+
92
+
93
+ def _make_feature(id_, sents, ws, eos):
94
+ if all(w == 0 for w in ws[1:]):
95
+ return None
96
+ input_ids = [i for s in sents for i in s+[eos]][:-1]
97
+ lm_labels = []
98
+ weights = []
99
+ token_type_ids = [] # this becomes round ids
100
+ for i, (s, w) in enumerate(zip(sents, ws)):
101
+ if i == 0:
102
+ lm_labels += [-1] * len(s)
103
+ weights += [0.0] * len(s)
104
+ token_type_ids += [0] * len(s)
105
+ continue
106
+
107
+ token_type_ids += [i] * (len(s) + 1)
108
+ if w == 0.0:
109
+ lm_labels += [-1] * (len(s) + 1)
110
+ weights += [0.0] * (len(s) + 1)
111
+ else:
112
+ lm_labels += (s + [eos])
113
+ weights += [w] * (len(s) + 1)
114
+
115
+ # handle trailing -1's
116
+ i = len(lm_labels) - 1
117
+ while i >= 0:
118
+ if lm_labels[i] != -1:
119
+ break
120
+ i -= 1
121
+ input_ids = input_ids[:i+1]
122
+ lm_labels = lm_labels[:i+1]
123
+ weights = weights[:i+1]
124
+ token_type_ids = token_type_ids[:i+1]
125
+
126
+ # pad to multiples of 8
127
+ while len(input_ids) % 8 != 0:
128
+ input_ids.append(0)
129
+ token_type_ids.append(0)
130
+ lm_labels.append(-1)
131
+ weights.append(0.0)
132
+
133
+ position_ids = list(range(len(input_ids)))
134
+ assert (len(input_ids) == len(position_ids) == len(token_type_ids)
135
+ == len(lm_labels) == len(weights))
136
+ assert len(input_ids) % 8 == 0
137
+ if len(input_ids) == 0:
138
+ import pdb
139
+ pdb.set_trace()
140
+ feature = InputFeatures(id_, input_ids, position_ids, token_type_ids,
141
+ lm_labels, weights)
142
+ return feature
143
+
144
+
145
+ def main(args):
146
+ toker = GPT2Tokenizer.from_pretrained('gpt2')
147
+ attrs = []
148
+ if args.reverse:
149
+ attrs.append('reverse')
150
+ if args.two_turn:
151
+ attrs.append('2turn')
152
+ if attrs:
153
+ db_path = (f'{args.corpus[:-4]}.{args.max_seq_len}len.'
154
+ f'{".".join(attrs)}.db/db')
155
+ else:
156
+ db_path = f'{args.corpus[:-4]}.{args.max_seq_len}len.db/db'
157
+ if exists(dirname(db_path)):
158
+ raise ValueError('Found existing DB, please backup')
159
+ else:
160
+ os.makedirs(dirname(db_path))
161
+ with open(args.corpus, "r", encoding="utf-8") as reader, \
162
+ shelve.open(db_path, 'n') as db:
163
+ chunk = []
164
+ n_chunk = 0
165
+ n_example = 0
166
+ for line in tqdm(reader, total=_get_file_len(args.corpus)):
167
+ try:
168
+ if len(chunk) >= args.chunk_size:
169
+ # save and renew chunk
170
+ db[f'chunk_{n_chunk}'] = gzip.compress(
171
+ json.dumps(chunk[:args.chunk_size]).encode('utf-8'))
172
+ chunk = chunk[args.chunk_size:]
173
+ n_chunk += 1
174
+
175
+ weights, inputs = _get_inputs_from_text(line, toker)
176
+ if args.reverse:
177
+ weights = list(reversed(weights))
178
+ inputs = list(reversed(inputs))
179
+ if args.two_turn:
180
+ weights = weights[:2]
181
+ inputs = inputs[:2]
182
+ if len(weights) < 2:
183
+ continue
184
+ features = _make_features(n_example, weights, inputs,
185
+ toker, args.max_seq_len)
186
+ for feature in features:
187
+ chunk.append(vars(feature))
188
+ n_example += 1
189
+ except Exception as e:
190
+ print('!!! prepro exception !!!', e)
191
+ continue
192
+ # save last chunk
193
+ db[f'chunk_{n_chunk}'] = gzip.compress(
194
+ json.dumps(chunk).encode('utf-8'))
195
+ # save relevant information to reproduce
196
+ meta = {'n_example': n_example,
197
+ 'chunk_size': args.chunk_size,
198
+ 'max_seq_len': args.max_seq_len,
199
+ 'reverse': args.reverse,
200
+ 'two_turn': args.two_turn}
201
+ with open(join(dirname(db_path), 'meta.json'), 'w') as writer:
202
+ json.dump(meta, writer, indent=4)
203
+ torch.save(toker, join(dirname(db_path), 'tokenizer.pt'))
204
+
205
+
206
+ if __name__ == '__main__':
207
+ parser = argparse.ArgumentParser()
208
+ parser.add_argument('--corpus', required=True,
209
+ help='file name of training corpus (should be .tsv)')
210
+ parser.add_argument('--chunk_size', type=int, default=65536,
211
+ help='num of data examples in a storing chunk')
212
+ parser.add_argument('--max_seq_len', type=int, default=128,
213
+ help='discard data longer than this')
214
+ parser.add_argument('--reverse', action='store_true',
215
+ help='reverse the src tgt')
216
+ parser.add_argument('--two_turn', action='store_true',
217
+ help='take only the first 2 turns')
218
+
219
+ args = parser.parse_args()
220
+
221
+ main(args)
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers
2
+ ftfy
3
+ regex
4
+ tqdm
5
+ torch
6
+ torchvision
7
+ pycrypto
8
+ flashtext
9
+ zstandard
10
+ nltk