/*
 * Decompiled with CFR 0.152.
 */
package edu.berkeley.nlp.lm.io;

import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.WordIndexer;
import edu.berkeley.nlp.lm.collections.Iterators;
import edu.berkeley.nlp.lm.io.IOUtils;
import edu.berkeley.nlp.lm.io.LmReader;
import edu.berkeley.nlp.lm.io.NgramOrderedLmReaderCallback;
import edu.berkeley.nlp.lm.util.Logger;
import edu.berkeley.nlp.lm.util.LongRef;
import java.io.File;
import java.io.IOException;

/**
 * Reader for n-gram count files whose lines have the form
 * {@code word_1 word_2 ... word_k<TAB>count}, together with a count-sorted
 * vocabulary file (usually named {@code vocab_cs.gz}).
 *
 * <p>NOTE(review): this class was recovered by a decompiler and the body of
 * {@link #parse(NgramOrderedLmReaderCallback)} could not be reconstructed; see
 * that method for details.
 *
 * @param <W> the word type handled by the {@link WordIndexer}
 */
public class GoogleLmReader<W>
implements LmReader<LongRef, NgramOrderedLmReaderCallback<LongRef>> {
    private static final String START_SYMBOL = "<S>";
    private static final String END_SYMBOL = "</S>";
    private static final String UNK_SYMBOL = "<UNK>";
    /** Conventional file name of the count-sorted vocabulary file. */
    private static final String SORTED_VOCAB_FILE = "vocab_cs.gz";
    private final String rootDir;
    private final WordIndexer<W> wordIndexer;

    /**
     * Creates a reader.
     *
     * @param rootDir     root directory of the n-gram data (stored, but not read by any
     *                    method whose body is visible in this file)
     * @param wordIndexer indexer used to map word strings to integer ids
     * @param opts        accepted for interface compatibility; currently unused
     */
    public GoogleLmReader(String rootDir, WordIndexer<W> wordIndexer, ConfigOptions opts) {
        this.rootDir = rootDir;
        this.wordIndexer = wordIndexer;
    }

    /**
     * Not available: the decompiler (CFR 0.152) failed to reconstruct this method
     * ({@code ConfusedCFRException: Started 2 blocks at once}), so the original
     * logic is missing from this source file.
     *
     * @param callback receiver for parsed n-grams (never invoked by this stub)
     * @throws IllegalStateException always
     */
    @Override
    public void parse(NgramOrderedLmReaderCallback<LongRef> callback) {
        // The real implementation must be restored from the original sources or class file.
        throw new IllegalStateException("Decompilation failed");
    }

    /**
     * Parses one line of the form {@code word_1 ... word_k<TAB>count}, converts every
     * word to its index and hands the resulting n-gram and count to {@code callback}.
     *
     * @param line       the raw line; must contain a tab separating the words from the count
     * @param ngramOrder zero-based order, i.e. the line must contain {@code ngramOrder + 1}
     *                   space-separated words
     * @param callback   receiver of the parsed n-gram
     * @throws IllegalArgumentException if the line has no tab or the wrong number of words
     * @throws NumberFormatException    if the text after the tab is not a valid {@code long}
     */
    private void parseLine(String line, int ngramOrder, NgramOrderedLmReaderCallback<LongRef> callback) {
        int tabIndex = line.indexOf('\t');
        if (tabIndex < 0) {
            throw new IllegalArgumentException("Expected \"<words><TAB><count>\" but found no tab in line: " + line);
        }
        String words = line.substring(0, tabIndex);
        int[] ngram = new int[ngramOrder + 1];
        int wordStart = 0;
        int wordCount = 0;
        while (true) {
            // Search inside `words` only, so that a space occurring after the tab can
            // never yield an end index beyond the word section.
            int wordEnd = words.indexOf(' ', wordStart);
            if (wordEnd < 0) {
                wordEnd = words.length();
            }
            if (wordCount == ngram.length) {
                throw new IllegalArgumentException("Expected " + ngram.length + " words but found more in line: " + line);
            }
            ngram[wordCount++] = this.wordIndexer.getOrAddIndexFromString(words.substring(wordStart, wordEnd));
            if (wordEnd == words.length()) {
                break;
            }
            wordStart = wordEnd + 1;
        }
        if (wordCount != ngram.length) {
            // Without this check the unfilled tail of `ngram` would silently be reported as index 0.
            throw new IllegalArgumentException("Expected " + ngram.length + " words but found " + wordCount + " in line: " + line);
        }
        long count = Long.parseLong(line.substring(tabIndex + 1));
        callback.call(ngram, 0, ngram.length, new LongRef(count), words);
    }

    /**
     * Adds every word of a count-sorted vocabulary file to {@code wordIndexer}, in file
     * order, and then registers the start, end and unknown-word symbols.
     *
     * <p>Only the text before the first tab of each line is used as the word; a line
     * without a tab is used in full. A warning is logged when the file is not named
     * {@code vocab_cs.gz}.
     *
     * @param wordIndexer     indexer to populate
     * @param sortedVocabPath path of the count-sorted vocabulary file
     * @throws RuntimeException wrapping any {@link IOException} or
     *                          {@link NumberFormatException} raised while reading
     */
    public static <W> void addToIndexer(WordIndexer<W> wordIndexer, String sortedVocabPath) {
        if (!new File(sortedVocabPath).getName().equals(SORTED_VOCAB_FILE)) {
            Logger.warn("You have specified that " + sortedVocabPath + " is the count-sorted vocab file for Google n-grams, but it is usually named " + SORTED_VOCAB_FILE);
        }
        try {
            for (String line : Iterators.able(IOUtils.lineIterator(sortedVocabPath))) {
                // Take the prefix before the first tab directly: String.split drops trailing
                // empty strings, so a line consisting only of tabs would give an empty array.
                int tabIndex = line.indexOf('\t');
                String word = tabIndex < 0 ? line : line.substring(0, tabIndex);
                wordIndexer.getOrAddIndexFromString(word);
            }
        }
        catch (NumberFormatException e) {
            throw new RuntimeException(e);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
        GoogleLmReader.addSpecialSymbols(wordIndexer);
    }

    /**
     * Ensures {@code <S>}, {@code </S>} and {@code <UNK>} are present in the indexer and
     * registers them as its start, end and unknown-word symbols respectively.
     */
    private static <W> void addSpecialSymbols(WordIndexer<W> wordIndexer) {
        wordIndexer.setStartSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(START_SYMBOL)));
        wordIndexer.setEndSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(END_SYMBOL)));
        wordIndexer.setUnkSymbol(wordIndexer.getWord(wordIndexer.getOrAddIndexFromString(UNK_SYMBOL)));
    }
}

