Spaces:
Paused
Paused
| # Copyright (c) Facebook, Inc. and its affiliates. | |
| # All rights reserved. | |
| # | |
| # This source code is licensed under the BSD-style license found in the | |
| # LICENSE file in the root directory of this source tree. | |
| # | |
| # LASER Language-Agnostic SEntence Representations | |
| # is a toolkit to calculate multilingual sentence embeddings | |
| # and to use them for document classification, bitext filtering | |
| # and mining | |
| # | |
| # -------------------------------------------------------- | |
| # | |
| # bash script to calculate sentence embeddings for the MLDoc corpus, | |
| # train and evaluate the classifier | |
| if [ -z ${LASER+x} ] ; then | |
| echo "Please set the environment variable 'LASER'" | |
| exit | |
| fi | |
| # general config | |
| mldir="MLDoc" # raw texts of MLdoc | |
| edir="embed" # normalized texts and embeddings | |
| languages=('en' 'de' 'es' 'fr' 'it' 'ja' 'ru' 'zh') | |
| # encoder | |
| model_dir="${LASER}/models" | |
| encoder="${model_dir}/bilstm.93langs.2018-12-26.pt" | |
| bpe_codes="${model_dir}/93langs.fcodes" | |
| edir="embed" | |
| ################################################################### | |
| # | |
| # Extract files with labels and texts from the MLdoc corpus | |
| # | |
| ################################################################### | |
| ExtractMLdoc () { | |
| ifname=$1 | |
| ofname=$2 | |
| lang=$3 | |
| if [ ! -f ${ifname}.${lang} ] ; then | |
| echo "Please install the MLDoc corpus first" | |
| exit | |
| fi | |
| if [ ! -f ${ofname}.lbl.${lang} ] ; then | |
| echo " - extract labels from ${ifname}.${lang}" | |
| cut -d' ' -f1 ${ifname}.${lang} \ | |
| | sed -e 's/C/0/' -e 's/E/1/' -e 's/G/2/' -e 's/M/3/' \ | |
| > ${ofname}.lbl.${lang} | |
| fi | |
| if [ ! -f ${ofname}.txt.${lang} ] ; then | |
| echo " - extract texts from ${ifname}.${lang}" | |
| # remove text which is not useful for classification | |
| cut -d' ' -f2 ${ifname}.${lang} \ | |
| | sed -e 's/ Co \./ Co./g' -e s'/ Inc \. / Inc. /g' \ | |
| -e 's/([cC]) Reuters Limited 199[0-9]\.//g' \ | |
| > ${ofname}.txt.${lang} | |
| fi | |
| } | |
| ################################################################### | |
| # | |
| # Create all files | |
| # | |
| ################################################################### | |
| # create output directories | |
| for d in ${edir} ; do | |
| mkdir -p ${d} | |
| done | |
| # Embed all data | |
| echo -e "\nExtracting MLDoc data" | |
| #ExtractMLdoc ${mldir}/mldoc.train1000 ${edir}/mldoc.train1000 "en" | |
| for part in "mldoc.train1000" "mldoc.dev" "mldoc.test" ; do | |
| for l in ${languages[@]} ; do | |
| ExtractMLdoc ${mldir}/${part} ${edir}/${part} ${l} | |
| done | |
| done | |
| MECAB="${LASER}/tools-external/mecab" | |
| export LD_LIBRARY_PATH="${MECAB}/lib:${LD_LIBRARY_PATH}" | |
| python3 mldoc.py --data_dir ${edir} --lang ${languages[@]} --bpe_codes ${bpe_codes} --encoder ${encoder} | |
| # MLDoc classifier parameters | |
| nb_cl=4 | |
| N=500 | |
| lr=0.001 | |
| wd=0.0 | |
| nhid="10 8" | |
| drop=0.2 | |
| seed=1 | |
| bsize=12 | |
| echo -e "\nTraining MLDoc classifier (log files in ${edir})" | |
| #for ltrn in "en" ; do | |
| for ltrn in ${languages[@]} ; do | |
| ldev=${ltrn} | |
| lf="${edir}/mldoc.${ltrn}-${ldev}.log" | |
| echo " - train on ${ltrn}, dev on ${ldev}" | |
| if [ ! -f ${lf} ] ; then | |
| python3 ${LASER}/source/sent_classif.py \ | |
| --gpu 0 --base-dir ${edir} \ | |
| --train mldoc.train1000.enc.${ltrn} \ | |
| --train-labels mldoc.train1000.lbl.${ltrn} \ | |
| --dev mldoc.dev.enc.${ldev} \ | |
| --dev-labels mldoc.dev.lbl.${ldev} \ | |
| --test mldoc.test.enc \ | |
| --test-labels mldoc.test.lbl \ | |
| --nb-classes ${nb_cl} \ | |
| --nhid ${nhid[@]} --dropout ${drop} --bsize ${bsize} \ | |
| --seed ${seed} --lr ${lr} --wdecay ${wd} --nepoch ${N} \ | |
| --lang ${languages[@]} \ | |
| > ${lf} | |
| fi | |
| done | |
| # display results | |
| echo -e "\nAccuracy matrix:" | |
| echo -n "Train " | |
| for l1 in ${languages[@]} ; do | |
| printf " %2s " ${l1} | |
| done | |
| echo "" | |
| for l1 in ${languages[@]} ; do | |
| lf="${edir}/mldoc.${l1}-${l1}.log" | |
| echo -n " ${l1}: " | |
| for l2 in ${languages[@]} ; do | |
| grep "Test lang ${l2}" $lf | sed -e 's/%//' | awk '{printf(" %5.2f", $10)}' | |
| done | |
| echo "" | |
| done | |