/*
 * Decompiled with CFR 0.152.
 */
package org.languagetool.tokenizers;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.languagetool.tokenizers.Tokenizer;
import org.languagetool.tools.StringTools;

/**
 * Splits text into tokens at a fixed set of delimiter characters and then
 * re-joins the pieces that belong to an e-mail address or a URL, so that each
 * of those ends up as a single token.
 *
 * <p>Delimiters are returned as tokens of their own (one token per delimiter
 * character), so concatenating the result of {@link #tokenize(String)} yields
 * the original text.
 */
public class WordTokenizer implements Tokenizer {
    /** Protocol names that may introduce a URL (followed by {@code ://}). */
    private static final List<String> PROTOCOLS = Collections.unmodifiableList(Arrays.asList("http", "https", "ftp"));

    /**
     * Characters accepted in the very last token of a text while inside a URL.
     * NOTE(review): inside the character class, {@code $-_} is a range (from
     * '$' to '_'), not three literals. It is kept as-is because narrowing it
     * would change which trailing tokens terminate a URL.
     */
    private static final Pattern URL_CHARS = Pattern.compile("[a-zA-Z0-9/%$-_.+!*'(),\\?]+");

    /** E-mail address: local part, {@code @}, then a bracketed IPv4-style literal or a dotted domain name. */
    private static final Pattern E_MAIL = Pattern.compile("(?<!:)\\b[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\])|(([a-zA-Z\\-0-9]+\\.)+[a-zA-Z]{2,}))\\b");

    /** Every character in this string is a delimiter for {@link #tokenize(String)}. */
    private static final String TOKENIZING_CHARACTERS = " \u00a0\u115f\u1160\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u200c\u200d\u200e\u200f\u2028\u2029\u202a\u202b\u202c\u202d\u202e\u202f\u205f\u2060\u2061\u2062\u2063\u206a\u206b\u206c\u206d\u206e\u206f\u3000\u3164\ufeff\uffa0\ufff9\ufffa\ufffb,.;()[]{}=*#\u2217\u00d7\u00b7+\u00f7<>!?:/|\\\"'\u00ab\u00bb\u201e\u201d\u201c`\u00b4\u2018\u2019\u201b\u2032\u2026\u00bf\u00a1\u2192\u203c\u2047\u2048\u2049\u2014\t\n\r";

    /**
     * Returns the protocol names recognised as the start of a URL.
     *
     * @return an unmodifiable list of protocol names, without the {@code ://} suffix
     */
    public static List<String> getProtocols() {
        return PROTOCOLS;
    }

    /**
     * Tells whether a token looks like a URL.
     *
     * @param token the token to inspect; must not be {@code null}
     * @return {@code true} if the token starts with {@code www.} or with one of
     *         the known protocols followed by {@code ://}
     */
    public static boolean isUrl(String token) {
        // The "www." test does not depend on the protocol, so it is done once.
        if (token.startsWith("www.")) {
            return true;
        }
        for (String protocol : WordTokenizer.getProtocols()) {
            if (token.startsWith(protocol + "://")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a token is, in its entirety, an e-mail address.
     *
     * @param token the token to inspect
     * @return {@code true} if the whole token matches the e-mail pattern
     */
    public static boolean isEMail(String token) {
        return E_MAIL.matcher(token).matches();
    }

    /**
     * Splits the text at the characters from {@link #getTokenizingCharacters()},
     * keeping each delimiter as its own token, and then joins e-mail addresses
     * and URLs back into single tokens.
     *
     * @param text the text to tokenize
     * @return the list of tokens, delimiters included
     */
    @Override
    public List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<String>();
        // 'true' makes the StringTokenizer return the delimiters as tokens too.
        StringTokenizer st = new StringTokenizer(text, this.getTokenizingCharacters(), true);
        while (st.hasMoreElements()) {
            tokens.add(st.nextToken());
        }
        return this.joinEMailsAndUrls(tokens);
    }

    /**
     * Returns the delimiter set used by {@link #tokenize(String)}.
     *
     * @return a string in which every character is a token delimiter
     */
    public String getTokenizingCharacters() {
        return TOKENIZING_CHARACTERS;
    }

    /**
     * Joins e-mail addresses first and URLs second.
     *
     * @param list the raw tokens
     * @return the tokens with e-mail addresses and URLs merged into single tokens
     */
    protected List<String> joinEMailsAndUrls(List<String> list) {
        return this.joinUrls(this.joinEMails(list));
    }

    /**
     * Merges the tokens that together form an e-mail address into one token.
     *
     * <p>The tokens are concatenated and searched with the e-mail pattern. A
     * match is merged only if it begins and ends exactly on token boundaries.
     * A match that begins or ends inside a token (for example
     * {@code b@example.com} inside the tokens {@code "a§b@example", ".", "com"})
     * is left untouched, so no input text is ever dropped: the concatenation of
     * the returned tokens always equals the concatenation of the input tokens.
     *
     * @param list the tokens to process
     * @return {@code list} itself if it contains no e-mail address, otherwise a
     *         new list with every boundary-aligned address merged
     */
    protected List<String> joinEMails(List<String> list) {
        StringBuilder sb = new StringBuilder();
        for (String item : list) {
            sb.append(item);
        }
        String text = sb.toString();
        Matcher matcher = E_MAIL.matcher(text);
        if (!matcher.find()) {
            return list;
        }
        List<String> result = new ArrayList<String>();
        int idx = 0;       // index of the next token not yet written to result
        int position = 0;  // offset in text at which token idx begins
        do {
            int start = matcher.start();
            int end = matcher.end();
            // Copy everything in front of the match unchanged. 'position < start'
            // implies there are tokens left, because start never exceeds the
            // total length of all tokens.
            while (position < start) {
                String token = list.get(idx);
                result.add(token);
                position += token.length();
                ++idx;
            }
            if (position != start) {
                // The match begins inside a token that was already copied.
                continue;
            }
            // Find the token boundary at or after the end of the match.
            int endIdx = idx;
            int endPosition = position;
            while (endPosition < end) {
                endPosition += list.get(endIdx).length();
                ++endIdx;
            }
            if (endPosition == end) {
                result.add(matcher.group());
                idx = endIdx;
                position = endPosition;
            }
            // Otherwise the match ends inside a token; the tokens stay as they
            // are and get copied by a later iteration or by the tail copy below.
        } while (matcher.find());
        result.addAll(list.subList(idx, list.size()));
        return result;
    }

    /**
     * Merges the tokens that together form a URL into one token.
     *
     * <p>Tokens are appended to a buffer from the point where
     * {@code urlStartsAt} reports a start until {@code urlEndsAt} reports an
     * end. The token that ends the URL is not part of it and is emitted
     * separately. A URL still open at the end of the list is emitted as is.
     *
     * @param l the tokens to process
     * @return a new list with URLs merged
     */
    protected List<String> joinUrls(List<String> l) {
        List<String> newList = new ArrayList<String>();
        boolean inUrl = false;
        StringBuilder url = new StringBuilder();
        // Token directly in front of the URL; if it shows up again followed by
        // whitespace, it is taken as the closing counterpart and ends the URL.
        String urlQuote = null;
        for (int i = 0; i < l.size(); ++i) {
            String token = l.get(i);
            if (this.urlStartsAt(i, l)) {
                inUrl = true;
                if (i - 1 >= 0) {
                    urlQuote = l.get(i - 1);
                }
                url.append(token);
            } else if (inUrl && this.urlEndsAt(i, l, urlQuote)) {
                inUrl = false;
                urlQuote = null;
                newList.add(url.toString());
                url.setLength(0);
                newList.add(token);
            } else if (inUrl) {
                url.append(token);
            } else {
                newList.add(token);
            }
        }
        if (url.length() > 0) {
            newList.add(url.toString());
        }
        return newList;
    }

    /**
     * Tells whether a URL begins at token {@code i}: either a known protocol
     * followed by the tokens {@code ":"}, {@code "/"}, {@code "/"}, or the
     * token {@code "www"} followed by {@code "."}.
     *
     * @param i index of the candidate token
     * @param l all tokens
     * @return {@code true} if a URL starts at index {@code i}
     */
    private boolean urlStartsAt(int i, List<String> l) {
        String token = l.get(i);
        if (this.isProtocol(token) && l.size() > i + 3) {
            boolean schemeSeparator = l.get(i + 1).equals(":")
                    && l.get(i + 2).equals("/")
                    && l.get(i + 3).equals("/");
            if (schemeSeparator) {
                return true;
            }
        }
        return l.size() > i + 1 && token.equals("www") && l.get(i + 1).equals(".");
    }

    /**
     * Tells whether the token is one of the known protocol names.
     *
     * @param token the token to inspect
     * @return {@code true} if {@code token} equals an entry of {@code PROTOCOLS}
     */
    private boolean isProtocol(String token) {
        return PROTOCOLS.contains(token);
    }

    /**
     * Tells whether token {@code i} terminates the URL currently being collected.
     *
     * <p>The URL ends at a token that {@code StringTools.isWhitespace} accepts,
     * at {@code ")"} or {@code "]"}, or at a sentence punctuation mark or the
     * {@code urlQuote} token when the following token is whitespace. The last
     * token of the list ends the URL only if it does not match
     * {@code URL_CHARS}.
     *
     * @param i index of the token to inspect
     * @param l all tokens
     * @param urlQuote the token that preceded the URL, or {@code null}
     * @return {@code true} if the URL ends before token {@code i}
     */
    private boolean urlEndsAt(int i, List<String> l, String urlQuote) {
        String token = l.get(i);
        if (StringTools.isWhitespace(token)) {
            return true;
        }
        if (token.equals(")") || token.equals("]")) {
            return true;
        }
        if (l.size() > i + 1) {
            String nToken = l.get(i + 1);
            return StringTools.isWhitespace(nToken)
                    && (StringUtils.equalsAny(token, ".", ",", ";", ":", "!", "?") || token.equals(urlQuote));
        }
        // Last token of the list: there is no following token to look at.
        return !URL_CHARS.matcher(token).matches();
    }
}

