/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.ArabicSegmenterAnnotator;
import edu.stanford.nlp.pipeline.ChineseSegmenterAnnotator;
import edu.stanford.nlp.pipeline.CleanXmlAnnotator;
import edu.stanford.nlp.pipeline.LanguageInfo;
import edu.stanford.nlp.pipeline.StatTokSentAnnotator;
import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;
import edu.stanford.nlp.process.CodepointCoreLabelProcessor;
import edu.stanford.nlp.process.CoreLabelProcessor;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.process.WordToSentenceProcessor;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.ReflectionLoading;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

public class TokenizerAnnotator
implements Annotator {
    /** Logging channel for this annotator. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(TokenizerAnnotator.class);
    /** Property consulted when building the whitespace tokenizer to decide whether end-of-line is significant. */
    public static final String EOL_PROPERTY = "tokenize.keepeol";
    /** Tokenizer option (with trailing comma) that is prepended to the option string when newlines must be kept. */
    public static final String KEEP_NL_OPTION = "tokenizeNLs,";
    /** Whether progress messages are logged; the {@code tokenize.verbose} property overrides the constructor argument. */
    private final boolean VERBOSE;
    /** Factory used by {@link #getTokenizer(Reader)}; {@code null} for the Arabic and Chinese tokenizer types. */
    private final TokenizerFactory<CoreLabel> factory;
    /** Arabic or Chinese segmenter used instead of the factory, or {@code null} when no segmenter applies. */
    private final Annotator segmenterAnnotator;
    /** Created only when the {@code cdc_tokenize.model} property is set; when non-null it handles annotation entirely. */
    private final StatTokSentAnnotator cdcAnnotator;
    /** Run after tokenization when {@code tokenize.cleanxml} is true; otherwise {@code null}. */
    private final CleanXmlAnnotator cleanxmlAnnotator;
    /** Run after tokenization unless {@code tokenize.ssplit} is false; otherwise {@code null}. */
    private final WordsToSentencesAnnotator ssplitAnnotator;
    /** Unmodifiable list of processors applied, in order, to the token list produced by the tokenizer. */
    private final List<CoreLabelProcessor> postProcessors;

    /**
     * Creates a non-verbose annotator using the English tokenizer type.
     */
    public TokenizerAnnotator() {
        this(false, TokenizerType.English);
    }

    /**
     * Derives the extra tokenizer option implied by the sentence-splitting properties.
     *
     * <p>Newlines must be kept as tokens when {@code ssplit.eolonly} is true, or when the
     * document is not declared to be a single sentence ({@code ssplit.isOneSentence}) and
     * {@code ssplit.newlineIsSentenceBreak} resolves to anything other than {@code NEVER}.
     *
     * @param properties the pipeline properties; may be {@code null}, in which case no extra
     *                   option is needed (the constructors accept {@code null} properties, so
     *                   this helper must not fail before their own null handling runs)
     * @return {@link #KEEP_NL_OPTION} when newlines must be tokenized, otherwise {@code null}
     */
    private static String computeExtraOptions(Properties properties) {
        if (properties == null) {
            return null;
        }
        boolean keepNewline = Boolean.parseBoolean(properties.getProperty("ssplit.eolonly", "false"));
        boolean oneSentence = Boolean.parseBoolean(properties.getProperty("ssplit.isOneSentence"));
        if (!oneSentence) {
            // Always parse the setting (even if keepNewline is already true) so that an
            // invalid value is reported exactly as before.
            WordToSentenceProcessor.NewlineIsSentenceBreak nlsb = WordToSentenceProcessor.stringToNewlineIsSentenceBreak(
                    properties.getProperty("ssplit.newlineIsSentenceBreak", "never"));
            if (nlsb != WordToSentenceProcessor.NewlineIsSentenceBreak.NEVER) {
                keepNewline = true;
            }
        }
        return keepNewline ? KEEP_NL_OPTION : null;
    }

    /**
     * Creates a non-verbose annotator configured from the given properties; the extra
     * tokenizer options are derived from the sentence-splitting properties.
     *
     * @param properties the pipeline properties
     */
    public TokenizerAnnotator(Properties properties) {
        this(false, properties, computeExtraOptions(properties));
    }

    /**
     * Creates an annotator using the English tokenizer type.
     *
     * @param verbose default verbosity (the {@code tokenize.verbose} property, if set, wins)
     */
    public TokenizerAnnotator(boolean verbose) {
        this(verbose, TokenizerType.English.toString());
    }

    /**
     * Creates an annotator for the given language with no extra options.
     * Note that this overload passes {@code true} as the default verbosity.
     *
     * @param lang value for the {@code tokenize.language} property; may be {@code null}
     */
    public TokenizerAnnotator(String lang) {
        this(true, lang);
    }

    /**
     * Creates an annotator for the given tokenizer type, identified by its enum name.
     *
     * @param verbose default verbosity
     * @param lang    tokenizer type; its {@code toString()} becomes the language name
     */
    public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
        this(verbose, lang.toString(), null);
    }

    /**
     * Creates an annotator for the given language without extra tokenizer options.
     *
     * @param verbose default verbosity
     * @param lang    value for the {@code tokenize.language} property; may be {@code null}
     */
    public TokenizerAnnotator(boolean verbose, String lang) {
        this(verbose, lang, (String) null);
    }

    /**
     * Creates an annotator for the given language. A non-null language is wrapped into a
     * properties object holding only {@code tokenize.language}; a null language yields
     * null properties.
     *
     * @param verbose default verbosity
     * @param lang    value for the {@code tokenize.language} property; may be {@code null}
     * @param options extra tokenizer options to prepend, or {@code null}
     */
    public TokenizerAnnotator(boolean verbose, String lang, String options) {
        this(verbose, lang != null ? PropertiesUtils.asProperties("tokenize.language", lang) : null, options);
    }

    /**
     * Creates an annotator configured from the given properties; the extra tokenizer
     * options are derived from the sentence-splitting properties.
     *
     * @param verbose default verbosity
     * @param props   the pipeline properties
     */
    public TokenizerAnnotator(boolean verbose, Properties props) {
        this(verbose, props, computeExtraOptions(props));
    }

    /**
     * Fully configures the annotator.
     *
     * <p>Depending on the properties, one of three tokenization back ends is prepared:
     * <ul>
     *   <li>a segmenter annotator, when {@code tokenize.language} names a segmenter language
     *       (Arabic or Chinese) and {@code tokenize.whitespace} is not true;</li>
     *   <li>a {@link StatTokSentAnnotator}, when {@code cdc_tokenize.model} is set;</li>
     *   <li>otherwise only the tokenizer factory chosen by {@link TokenizerType}.</li>
     * </ul>
     * Optional post-processors ({@code tokenize.postProcessor}, {@code tokenize.codepoint}),
     * XML cleaning ({@code tokenize.cleanxml}) and sentence splitting ({@code tokenize.ssplit},
     * on by default) are also set up here.
     *
     * @param verbose default verbosity; overridden by the {@code tokenize.verbose} property
     * @param props   the pipeline properties; {@code null} is treated as empty properties
     * @param options extra tokenizer options to prepend to the configured ones, or {@code null}
     * @throws RuntimeException if the language is a segmenter language with no segmenter
     *                          implementation, or if the post-processor class cannot be loaded
     */
    public TokenizerAnnotator(boolean verbose, Properties props, String options) {
        if (props == null) {
            props = new Properties();
        }
        boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
        String language = props.getProperty("tokenize.language");
        if (language != null && LanguageInfo.isSegmenterLanguage(language) && !whitespace) {
            LanguageInfo.HumanLanguage humanLanguage = LanguageInfo.getLanguageFromString(language);
            this.cdcAnnotator = null;
            if (humanLanguage == LanguageInfo.HumanLanguage.ARABIC) {
                this.segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
            } else if (humanLanguage == LanguageInfo.HumanLanguage.CHINESE) {
                this.segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
            } else {
                throw new RuntimeException("No segmenter implemented for: " + humanLanguage);
            }
        } else if (props.getProperty("cdc_tokenize.model", null) != null) {
            this.cdcAnnotator = new StatTokSentAnnotator(props);
            this.segmenterAnnotator = null;
        } else {
            this.segmenterAnnotator = null;
            this.cdcAnnotator = null;
        }

        String postProcessorClass = props.getProperty("tokenize.postProcessor", "");
        List<CoreLabelProcessor> processors = new ArrayList<CoreLabelProcessor>();
        if (!postProcessorClass.isEmpty()) {
            try {
                processors.add(ReflectionLoading.<CoreLabelProcessor>loadByReflection(postProcessorClass, new Object[0]));
            } catch (RuntimeException e) {
                // Keep the original exception as the cause so the real failure stays visible.
                throw new RuntimeException("Loading: " + postProcessorClass + " failed with: " + e.getMessage(), e);
            }
        }
        if (PropertiesUtils.getBool(props, "tokenize.codepoint")) {
            processors.add(new CodepointCoreLabelProcessor());
        }
        this.postProcessors = Collections.unmodifiableList(processors);

        this.VERBOSE = PropertiesUtils.getBool(props, "tokenize.verbose", verbose);
        TokenizerType type = TokenizerType.getTokenizerType(props);
        this.factory = TokenizerAnnotator.initFactory(type, props, options);
        if (this.VERBOSE) {
            log.info("Initialized tokenizer factory: " + this.factory);
        }

        this.cleanxmlAnnotator = PropertiesUtils.getBool(props, "tokenize.cleanxml") ? new CleanXmlAnnotator(props) : null;
        this.ssplitAnnotator = PropertiesUtils.getBool(props, "tokenize.ssplit", true) ? new WordsToSentencesAnnotator(props) : null;
    }

    /**
     * Builds the tokenizer factory for the given tokenizer type.
     *
     * <p>The option string comes from {@code tokenize.options}, falling back to the type's
     * default options; {@code extraOptions}, when present, is placed in front of it
     * (comma-separated).
     *
     * @param type         the tokenizer type to build a factory for
     * @param props        the pipeline properties
     * @param extraOptions options to prepend, or {@code null}
     * @return the factory, or {@code null} for the Arabic and Chinese types
     * @throws IllegalArgumentException if the type is not handled by any case
     */
    private static TokenizerFactory<CoreLabel> initFactory(TokenizerType type, Properties props, String extraOptions) throws IllegalArgumentException {
        String options = props.getProperty("tokenize.options", null);
        if (options == null) {
            options = type.getDefaultOptions();
        }
        if (extraOptions != null) {
            // Insert a comma only when the extra options do not already end with one.
            String separator = extraOptions.endsWith(",") ? "" : ",";
            options = extraOptions + separator + options;
        }

        switch (type) {
            case Arabic:
            case Chinese:
                // No factory is created for these types.
                return null;

            case Spanish:
                return SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);

            case French:
                return FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);

            case Whitespace: {
                boolean eolIsSignificant = Boolean.parseBoolean(props.getProperty(EOL_PROPERTY, "false"))
                        || KEEP_NL_OPTION.equals(TokenizerAnnotator.computeExtraOptions(props));
                return new WhitespaceTokenizer.WhitespaceTokenizerFactory<CoreLabel>(new CoreLabelTokenFactory(), eolIsSignificant);
            }

            case English:
            case German:
                return PTBTokenizer.factory(new CoreLabelTokenFactory(), options);

            case Unspecified:
                log.info("No tokenizer type provided. Defaulting to PTBTokenizer.");
                return PTBTokenizer.factory(new CoreLabelTokenFactory(), options);

            default:
                throw new IllegalArgumentException("No valid tokenizer type provided.\nUse -tokenize.language, -tokenize.class, or -tokenize.whitespace \nto specify a tokenizer.");
        }
    }

    /**
     * Creates a tokenizer over the given reader using this annotator's factory.
     *
     * <p>Note: the factory is {@code null} when the tokenizer type is Arabic or Chinese,
     * in which case this call fails with a {@link NullPointerException}.
     *
     * @param r the source of the text to tokenize
     * @return a tokenizer producing {@link CoreLabel} tokens
     */
    public Tokenizer<CoreLabel> getTokenizer(Reader r) {
        TokenizerFactory<CoreLabel> tokenizerFactory = this.factory;
        return tokenizerFactory.getTokenizer(r);
    }

    /**
     * Stamps each token with its position in the list: token begin is the zero-based
     * index and token end is that index plus one.
     *
     * @param tokensList the tokens to annotate, in document order
     */
    private static void setTokenBeginTokenEnd(List<CoreLabel> tokensList) {
        int end = 1;
        for (CoreLabel current : tokensList) {
            current.set(CoreAnnotations.TokenBeginAnnotation.class, end - 1);
            current.set(CoreAnnotations.TokenEndAnnotation.class, end);
            end++;
        }
    }

    /**
     * Marks every token with whether it is a newline token, i.e. whether its word is
     * exactly {@code "*NL*"}.
     *
     * @param tokensList the tokens to annotate
     */
    private static void setNewlineStatus(List<CoreLabel> tokensList) {
        for (CoreLabel token : tokensList) {
            // Constant-first comparison: a token without a word is simply not a newline,
            // rather than a NullPointerException.
            token.set(CoreAnnotations.IsNewlineAnnotation.class, "*NL*".equals(token.word()));
        }
    }

    /**
     * Removes the trailing space from the final token's "after" text.
     *
     * <p>Does nothing when the list is null or empty, or when the final token has no
     * (or empty) after text.
     *
     * @param tokens the token list whose last element is adjusted in place
     * @throws IllegalArgumentException if the after text ends with anything but a space
     */
    public static void adjustFinalToken(List<CoreLabel> tokens) {
        if (tokens == null || tokens.isEmpty()) {
            return;
        }
        CoreLabel finalToken = tokens.get(tokens.size() - 1);
        String after = finalToken.get(CoreAnnotations.AfterAnnotation.class);
        if (after == null || after.isEmpty()) {
            return;
        }
        int lastIndex = after.length() - 1;
        char last = after.charAt(lastIndex);
        if (last != ' ') {
            throw new IllegalArgumentException("adjustFinalToken: Unexpected final char: |" + last + "| (" + last + ')');
        }
        finalToken.set(CoreAnnotations.AfterAnnotation.class, after.substring(0, lastIndex));
    }

    /**
     * Tokenizes the annotation.
     *
     * <p>If a {@link StatTokSentAnnotator} is configured it handles the annotation alone.
     * Otherwise tokens come either from the segmenter annotator or from this annotator's
     * own tokenizer run over the text. For the latter, a single space is appended to the
     * text before tokenizing and then stripped again from the final token's after text.
     * Any existing sentences annotation is removed afterwards, and the optional XML
     * cleaner and sentence splitter are run.
     *
     * @param annotation the annotation to tokenize, modified in place
     * @throws RuntimeException if no segmenter is configured and the annotation has no text
     */
    @Override
    public void annotate(Annotation annotation) {
        if (this.VERBOSE) {
            log.info("Beginning tokenization");
        }

        if (this.cdcAnnotator != null) {
            this.cdcAnnotator.annotate(annotation);
            return;
        }

        if (this.segmenterAnnotator == null) {
            if (!annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
                throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
            }
            String paddedText = annotation.get(CoreAnnotations.TextAnnotation.class) + " ";
            List<CoreLabel> tokens = this.getTokenizer(new StringReader(paddedText)).tokenize();
            // Undo the padding on the last token before computing any other token info.
            adjustFinalToken(tokens);
            setNewlineStatus(tokens);
            setTokenBeginTokenEnd(tokens);
            for (CoreLabelProcessor processor : this.postProcessors) {
                tokens = processor.process(tokens);
            }
            annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
            if (this.VERBOSE) {
                log.info("Tokenized: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
            }
        } else {
            this.segmenterAnnotator.annotate(annotation);
            List<CoreLabel> segmented = annotation.get(CoreAnnotations.TokensAnnotation.class);
            setTokenBeginTokenEnd(segmented);
            setNewlineStatus(segmented);
        }

        // Drop any existing sentences annotation before the optional sentence splitter runs.
        annotation.remove(CoreAnnotations.SentencesAnnotation.class);
        if (this.cleanxmlAnnotator != null) {
            this.cleanxmlAnnotator.annotate(annotation);
        }
        if (this.ssplitAnnotator != null) {
            this.ssplitAnnotator.annotate(annotation);
        }
    }

    /**
     * This annotator has no prerequisite annotations.
     *
     * @return an empty set
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.<Class<? extends CoreAnnotation>>emptySet();
    }

    /**
     * Lists the annotation keys this annotator declares as satisfied.
     *
     * @return a new, mutable set of the annotation classes
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        Set<Class<? extends CoreAnnotation>> satisfied = new HashSet<Class<? extends CoreAnnotation>>();
        satisfied.add(CoreAnnotations.TextAnnotation.class);
        satisfied.add(CoreAnnotations.TokensAnnotation.class);
        satisfied.add(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
        satisfied.add(CoreAnnotations.CharacterOffsetEndAnnotation.class);
        satisfied.add(CoreAnnotations.BeforeAnnotation.class);
        satisfied.add(CoreAnnotations.AfterAnnotation.class);
        satisfied.add(CoreAnnotations.TokenBeginAnnotation.class);
        satisfied.add(CoreAnnotations.TokenEndAnnotation.class);
        satisfied.add(CoreAnnotations.PositionAnnotation.class);
        satisfied.add(CoreAnnotations.IndexAnnotation.class);
        satisfied.add(CoreAnnotations.OriginalTextAnnotation.class);
        satisfied.add(CoreAnnotations.ValueAnnotation.class);
        satisfied.add(CoreAnnotations.IsNewlineAnnotation.class);
        satisfied.add(CoreAnnotations.SentencesAnnotation.class);
        satisfied.add(CoreAnnotations.SentenceIndexAnnotation.class);
        return satisfied;
    }

    /**
     * The supported tokenizer types. Each type carries an optional language abbreviation,
     * an optional tokenizer class name, and the default tokenizer option string.
     *
     * <p>Lookups by language or class name are case-insensitive. Case folding uses
     * {@link Locale#ROOT} so that the result does not depend on the JVM's default locale
     * (with a Turkish default locale, for example, {@code "English".toUpperCase()} does not
     * equal {@code "ENGLISH"}).
     */
    public enum TokenizerType {
        Unspecified(null, null, "invertible,ptb3Escaping=true"),
        Arabic("ar", null, ""),
        Chinese("zh", null, ""),
        Spanish("es", "SpanishTokenizer", "invertible,ellipses=ascii,splitAll=false"),
        English("en", "PTBTokenizer", "invertible"),
        German("de", null, "invertible,ptb3Escaping=false,splitHyphenated=true"),
        French("fr", "FrenchTokenizer", "invertible,splitCompounds=false,splitContractions=false,quotes=ORIGINAL"),
        Whitespace(null, "WhitespaceTokenizer", "");

        /** Upper-cased abbreviation and enum name of each type, mapped to the type. */
        private static final Map<String, TokenizerType> nameToTokenizerMap = initializeNameMap();
        /** Upper-cased tokenizer class name of each type that has one, mapped to the type. */
        private static final Map<String, TokenizerType> classToTokenizerMap = initializeClassMap();

        private final String abbreviation;
        private final String className;
        private final String defaultOptions;

        TokenizerType(String abbreviation, String className, String defaultOptions) {
            this.abbreviation = abbreviation;
            this.className = className;
            this.defaultOptions = defaultOptions;
        }

        /**
         * @return the option string used when {@code tokenize.options} is not set
         */
        public String getDefaultOptions() {
            return this.defaultOptions;
        }

        /** Builds the lookup from upper-cased abbreviation / enum name to type. */
        private static Map<String, TokenizerType> initializeNameMap() {
            Map<String, TokenizerType> map = Generics.newHashMap();
            for (TokenizerType type : TokenizerType.values()) {
                if (type.abbreviation != null) {
                    map.put(type.abbreviation.toUpperCase(Locale.ROOT), type);
                }
                map.put(type.toString().toUpperCase(Locale.ROOT), type);
            }
            return Collections.unmodifiableMap(map);
        }

        /** Builds the lookup from upper-cased tokenizer class name to type. */
        private static Map<String, TokenizerType> initializeClassMap() {
            Map<String, TokenizerType> map = Generics.newHashMap();
            for (TokenizerType type : TokenizerType.values()) {
                if (type.className != null) {
                    map.put(type.className.toUpperCase(Locale.ROOT), type);
                }
            }
            return Collections.unmodifiableMap(map);
        }

        /**
         * Determines the tokenizer type from the properties. Precedence is:
         * {@code tokenize.whitespace} (if true), then {@code tokenize.class}, then
         * {@code tokenize.language} (which defaults to {@code "en"}).
         *
         * @param props the pipeline properties; must not be {@code null}
         * @return the selected tokenizer type
         * @throws IllegalArgumentException if the class or language name is not recognized
         */
        public static TokenizerType getTokenizerType(Properties props) {
            String tokClass = props.getProperty("tokenize.class", null);
            boolean whitespace = Boolean.parseBoolean(props.getProperty("tokenize.whitespace", "false"));
            String language = props.getProperty("tokenize.language", "en");
            if (whitespace) {
                return Whitespace;
            }
            if (tokClass != null) {
                TokenizerType type = classToTokenizerMap.get(tokClass.toUpperCase(Locale.ROOT));
                if (type == null) {
                    throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
                }
                return type;
            }
            if (language != null) {
                TokenizerType type = nameToTokenizerMap.get(language.toUpperCase(Locale.ROOT));
                if (type == null) {
                    throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
                }
                return type;
            }
            // Not reachable with the "en" default above; kept as a defensive fallback.
            return Unspecified;
        }
    }
}

