package org.languagetool.language;

import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.RemoveMinorityScriptsTextFilter;
import com.optimaize.langdetect.text.TextFilter;
import com.optimaize.langdetect.text.TextObjectFactory;
import com.optimaize.langdetect.text.TextObjectFactoryBuilder;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.jetbrains.annotations.Nullable;
import org.languagetool.DetectedLanguage;
import org.languagetool.Experimental;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.language.FastText;
import org.languagetool.noop.NoopLanguage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/languagetool/language/LanguageIdentifier.class */
public class LanguageIdentifier {
    // Minimal confidence handed to the langdetect LanguageDetectorBuilder in the constructor.
    private static final double MINIMAL_CONFIDENCE = 0.9d;
    // Short-text length threshold in characters; the value 50 is what the constructor passes to
    // shortTextAlgorithm(...) and what detectLanguage uses as the cutoff up to which the ngram
    // identifier is preferred over fastText.
    private static final int SHORT_ALGO_THRESHOLD = 50;
    // Length threshold in characters; the value 50 is the cutoff below which detectLanguage
    // restricts the candidate scores to the preferred languages.
    private static final int CONSIDER_ONLY_PREFERRED_THRESHOLD = 50;
    // A fastText top score below this value makes detectLanguage add CommonWords counts to the scores.
    private static final float THRESHOLD = 0.85f;
    // langdetect-based detector; used by detectLanguageCode as the fallback when neither fastText
    // nor the ngram identifier is available (or fastText failed).
    private final LanguageDetector languageDetector;
    // Builds filtered text objects (URLs, signatures, mentions, ... removed) for the fallback detector.
    private final TextObjectFactory textObjectFactory;
    // Maximum number of characters kept by cleanAndShortenText; validated to be >= 10 in the constructor.
    private final int maxLength;
    // Supplies the dominant language codes of a text (see detectLanguage).
    private final UnicodeBasedLangIdentifier unicodeIdentifier;
    // Optional fastText backend; null until enableFasttext succeeds, reset to null when it gets disabled.
    private FastText fastText;
    // Optional ngram backend; null until enableNgrams is called.
    private NGramLangIdentifier ngram;
    private static final Logger logger = LoggerFactory.getLogger(LanguageIdentifier.class);
    // NOTE(review): not referenced anywhere in the visible part of this file - confirm whether still needed.
    private static final List<String> RARE_LANGUAGES = Arrays.asList("eo", "ast", "be", "br", "da", "gl", "ga", "km", "fa", "ro", "sk", "sl", "sv", "tl", "ta", "no", "nb");
    // Language codes that getLanguageCodes never requests a langdetect profile for.
    private static final List<String> ignoreLangCodes = Arrays.asList("ast", "gl");
    // Language codes skipped by getLanguageCodes; loadProfiles instead reads their profile from
    // "/<code>/<code>.profile" in the resource directory when that resource exists.
    private static final List<String> externalLangCodes = Arrays.asList("eo");

    /* loaded from: input_file:org/languagetool/language/LanguageIdentifier$ImprovedUrlTextFilter.class */
    static class ImprovedUrlTextFilter implements TextFilter {
        private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#%0-9A-Za-z]+");
        private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
        private static final ImprovedUrlTextFilter INSTANCE = new ImprovedUrlTextFilter();

        ImprovedUrlTextFilter() {
        }

        static ImprovedUrlTextFilter getInstance() {
            return INSTANCE;
        }

        public String filter(CharSequence charSequence) {
            return MAIL_REGEX.matcher(URL_REGEX.matcher(charSequence).replaceAll(" ")).replaceAll(" ");
        }
    }

    /* loaded from: input_file:org/languagetool/language/LanguageIdentifier$RemoveEMailSignatureFilter.class */
    static class RemoveEMailSignatureFilter implements TextFilter {
        private static final Pattern SIGNATURE = Pattern.compile("\n-- \n.*", 32);

        RemoveEMailSignatureFilter() {
        }

        public String filter(CharSequence charSequence) {
            return SIGNATURE.matcher(charSequence.toString()).replaceFirst("");
        }
    }

    /* loaded from: input_file:org/languagetool/language/LanguageIdentifier$RemoveMentionFilter.class */
    static class RemoveMentionFilter implements TextFilter {
        private static final Pattern MENTION = Pattern.compile("@[A-Za-z0-9_]+");

        RemoveMentionFilter() {
        }

        public String filter(CharSequence charSequence) {
            return MENTION.matcher(charSequence.toString()).replaceFirst("");
        }
    }

    /* loaded from: input_file:org/languagetool/language/LanguageIdentifier$RemoveNonBreakingSpaces.class */
    static class RemoveNonBreakingSpaces implements TextFilter {
        RemoveNonBreakingSpaces() {
        }

        public String filter(CharSequence charSequence) {
            return charSequence.toString().replace((char) 160, ' ');
        }
    }

    /**
     * Creates an identifier that considers at most the first 1000 characters of a text.
     */
    public LanguageIdentifier() {
        this(1000);
    }

    /**
     * Creates an identifier and sets up the langdetect-based detector together
     * with the text filters used for it.
     *
     * @param i maximum number of characters {@link #cleanAndShortenText(String)} keeps; must be at least 10
     * @throws IllegalArgumentException if {@code i} is smaller than 10
     * @throws RuntimeException if the language profiles cannot be loaded
     */
    public LanguageIdentifier(int i) {
        this.unicodeIdentifier = new UnicodeBasedLangIdentifier();
        if (i < 10) {
            throw new IllegalArgumentException("maxLength must be >= 10 (but values > 100 are recommended): " + i);
        }
        this.maxLength = i;
        try {
            this.languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .minimalConfidence(MINIMAL_CONFIDENCE)
                    .shortTextAlgorithm(SHORT_ALGO_THRESHOLD)
                    .withProfiles(loadProfiles(getLanguageCodes()))
                    .build();
            this.textObjectFactory = new TextObjectFactoryBuilder()
                    .maxTextLength(10000)
                    .withTextFilter(ImprovedUrlTextFilter.getInstance())
                    .withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3d))
                    .withTextFilter(new RemoveEMailSignatureFilter())
                    .withTextFilter(new RemoveMentionFilter())
                    .withTextFilter(new RemoveNonBreakingSpaces())
                    .build();
        } catch (IOException e) {
            throw new RuntimeException("Could not set up language identifier", e);
        }
    }

    /**
     * Switches on fastText-based identification. Does nothing when either
     * argument is {@code null}.
     *
     * @param file  the fastText binary
     * @param file2 the fastText model
     * @throws RuntimeException if the fastText process cannot be started
     */
    public void enableFasttext(File file, File file2) {
        boolean bothGiven = file != null && file2 != null;
        if (bothGiven) {
            try {
                // FastText expects the model first, then the binary
                this.fastText = new FastText(file2, file);
                logger.info("Started fasttext process for language identification: Binary " + file + " with model @ " + file2);
            } catch (IOException e) {
                throw new RuntimeException("Could not start fasttext process for language identification @ " + file + " with model @ " + file2, e);
            }
        }
    }

    /**
     * @return {@code true} if a fastText instance is currently set, i.e. it was
     *         enabled and has not been disabled again after a failure
     */
    public boolean isFastTextEnabled() {
        if (this.fastText == null) {
            return false;
        }
        return true;
    }

    /**
     * Switches on ngram-based identification using the data found at the given location.
     *
     * @param file location of the ngram data
     * @throws RuntimeException if the ngram data cannot be loaded
     */
    public void enableNgrams(File file) {
        final NGramLangIdentifier loaded;
        try {
            logger.info("Loading ngram data for language identification from " + file + "...");
            loaded = new NGramLangIdentifier(file, 50);
            this.ngram = loaded;
            logger.info("Loaded ngram data for language identification from " + file);
        } catch (IOException e) {
            throw new RuntimeException("Could not load ngram data language identification from " + file, e);
        }
    }

    /**
     * Collects the language codes for which a langdetect profile is requested.
     * Variants and the codes in {@code ignoreLangCodes} / {@code externalLangCodes}
     * are skipped; "zh" is expanded to "zh-CN" and "zh-TW".
     *
     * @return the profile codes in the iteration order of {@code Languages.get()}
     */
    private static List<String> getLanguageCodes() {
        List<String> codes = new ArrayList<>();
        for (Language language : Languages.get()) {
            String shortCode = language.getShortCode();
            if (language.isVariant() || ignoreLangCodes.contains(shortCode) || externalLangCodes.contains(shortCode)) {
                continue;
            }
            if ("zh".equals(shortCode)) {
                // Chinese is requested as two separate profile codes
                codes.add("zh-CN");
                codes.add("zh-TW");
            } else if (!codes.contains(shortCode)) {
                codes.add(shortCode);
            }
        }
        return codes;
    }

    /**
     * Reads the built-in profiles for the given codes and appends the profiles of
     * the {@code externalLangCodes} that exist as {@code /<code>/<code>.profile}
     * in the resource directory.
     *
     * @param list language codes whose built-in profiles are to be read
     * @return the built-in profiles followed by the external ones that were found
     * @throws IOException if a profile cannot be read
     */
    private List<LanguageProfile> loadProfiles(List<String> list) throws IOException {
        List<LanguageProfile> profiles = new LanguageProfileReader().read(list);
        for (String langCode : externalLangCodes) {
            String profilePath = "/" + langCode + "/" + langCode + ".profile";
            if (JLanguageTool.getDataBroker().resourceExists(profilePath)) {
                // try-with-resources closes the stream on every path and keeps a failure
                // of close() from masking the exception that caused it
                try (InputStream profileStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(profilePath)) {
                    profiles.add(new LanguageProfileReader().read(profileStream));
                }
            }
        }
        return profiles;
    }

    /**
     * Cuts the text to at most {@code maxLength} characters and replaces runs of
     * U+FEFF with a space. When fastText or ngrams are enabled, URLs/e-mail
     * addresses, an e-mail signature, a mention and non-breaking spaces are
     * filtered as well.
     *
     * @param str the raw text
     * @return the shortened and cleaned text
     */
    public String cleanAndShortenText(String str) {
        String shortened = str;
        if (str.length() > this.maxLength) {
            shortened = str.substring(0, this.maxLength);
        }
        String cleaned = shortened.replaceAll("\ufeff+", " ");
        if (this.fastText == null && this.ngram == null) {
            return cleaned;
        }
        // Same filter order as the nested call chain: URL/mail, signature, mention, nbsp
        cleaned = ImprovedUrlTextFilter.getInstance().filter(cleaned);
        cleaned = new RemoveEMailSignatureFilter().filter(cleaned);
        cleaned = new RemoveMentionFilter().filter(cleaned);
        cleaned = new RemoveNonBreakingSpaces().filter(cleaned);
        return cleaned;
    }

    /**
     * Detects the language of the given text without any noop or preferred languages.
     *
     * @param str the text to analyze
     * @return the detected language, or {@code null} if none could be detected
     */
    @Nullable
    public Language detectLanguage(String str) {
        DetectedLanguage detected = detectLanguage(str, Collections.emptyList(), Collections.emptyList());
        return detected != null ? detected.getDetectedLanguage() : null;
    }

    /**
     * Like {@link #detectLanguage(String)}, but returns the full detection result
     * instead of only the language.
     *
     * @param str the text to analyze
     * @return the detection result, or {@code null} if no language could be detected
     */
    @Nullable
    @Experimental
    DetectedLanguage detectLanguageWithDetails(String str) {
        List<String> noLanguages = Collections.emptyList();
        return detectLanguage(str, noLanguages, noLanguages);
    }

    /**
     * Detects the language of the given text.
     *
     * <p>If fastText and/or ngrams are enabled, one of them scores the text (ngrams
     * for texts of up to {@code SHORT_ALGO_THRESHOLD} characters or when fastText is
     * unavailable, fastText otherwise). Without either of them, or when fastText
     * fails, the langdetect-based detector is used instead.
     *
     * @param str   the text to analyze
     * @param list  noop language codes; they are passed to the scoring backend and are
     *              accepted as detectable even if not otherwise supported ("nb" is mapped to "no")
     * @param list2 preferred language codes without variants ("nb" is mapped to "no")
     * @return the detection result, or {@code null} if no detectable language was found
     * @throws NullPointerException if {@code list} or {@code list2} is {@code null}
     * @throws IllegalArgumentException if {@code list2} contains a code with a variant (a code containing "-")
     */
    @Nullable
    public DetectedLanguage detectLanguage(String str, List<String> list, List<String> list2) {
        Objects.requireNonNull(list);
        Objects.requireNonNull(list2);
        // Both lists may be extended below, so they are collected into explicitly mutable
        // lists (Collectors.toList() gives no mutability guarantee).
        List<String> noopLangs = list.stream()
                .map(code -> code.equals("nb") ? "no" : code)
                .collect(Collectors.toCollection(ArrayList::new));
        List<String> preferredLangs = list2.stream()
                .map(code -> code.equals("nb") ? "no" : code)
                .collect(Collectors.toCollection(ArrayList::new));
        if (preferredLangs.stream().anyMatch(code -> code.contains("-"))) {
            throw new IllegalArgumentException("preferredLanguages may only contain language codes without variants (e.g. 'en', but not 'en-US'): " + preferredLangs + ". Use 'preferredVariants' to specify variants.");
        }
        List<String> dominantLangCodes = this.unicodeIdentifier.getDominantLangCodes(str);
        String dominant = String.join(",", dominantLangCodes);
        if (dominant.equals("th") || dominant.equals("he") || dominant.equals("ko") || dominant.equals("hi,mr")) {
            // These dominant languages are answered with the noop language right away
            return new DetectedLanguage(null, new NoopLanguage());
        }
        if (Collections.disjoint(preferredLangs, Arrays.asList("ru", "uk", "be", "zh", "hi", "mr"))) {
            preferredLangs.addAll(dominantLangCodes);
            noopLangs.addAll(dominantLangCodes);
        }
        Map.Entry<String, Double> result = null;
        boolean fastTextFailed = false;
        if (this.fastText != null || this.ngram != null) {
            try {
                Map<String, Double> scores;
                boolean usingFastText = false;
                if ((str.length() <= SHORT_ALGO_THRESHOLD || this.fastText == null) && this.ngram != null) {
                    scores = this.ngram.detectLanguages(str.trim(), noopLangs);
                } else {
                    usingFastText = true;
                    scores = this.fastText.runFasttext(str, noopLangs);
                }
                Map.Entry<String, Double> best = getHighestScoringResult(scores);
                // NOTE(review): for an empty score map the key is null and the equals() call below
                // throws, which ends up in the generic catch and disables fastText - confirm intended.
                if ((usingFastText && best.getValue().floatValue() < THRESHOLD) || best.getKey().equals(NoopLanguage.SHORT_CODE)) {
                    // Low confidence or noop result: add the number of known common words per language
                    Map<Language, Integer> knownWordsPerLanguage = new CommonWords().getKnownWordsPerLanguage(str);
                    HashSet<String> countedCodes = new HashSet<>();
                    for (Map.Entry<Language, Integer> wordCount : knownWordsPerLanguage.entrySet()) {
                        String shortCode = wordCount.getKey().getShortCode();
                        // Several languages can share a short code; only the first one is counted
                        if (countedCodes.add(shortCode)) {
                            scores.merge(shortCode, wordCount.getValue().doubleValue(), Double::sum);
                        }
                    }
                    best = getHighestScoringResult(scores);
                }
                if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
                    // Norwegian preferred but Danish not: drop Danish (presumably the two are easily confused)
                    scores.remove("da");
                    best = getHighestScoringResult(scores);
                }
                if (str.length() < CONSIDER_ONLY_PREFERRED_THRESHOLD && !preferredLangs.isEmpty()) {
                    // For short texts only the preferred languages stay candidates
                    scores.keySet().retainAll(preferredLangs);
                    best = getHighestScoringResult(scores);
                }
                // The reported confidence depends only on the text length: it grows linearly
                // up to 0.99, which is reached at 30 characters.
                double confidence = 0.99d / (30.0d / Math.min(str.length(), 30));
                result = new AbstractMap.SimpleImmutableEntry<>(best.getKey(), confidence);
            } catch (FastText.FastTextException e) {
                if (e.isDisabled()) {
                    this.fastText = null;
                    logger.error("Fasttext disabled", e);
                } else {
                    logger.error("Fasttext failed, fallback used", e);
                    fastTextFailed = true;
                }
            } catch (Exception e2) {
                this.fastText = null;
                logger.error("Fasttext disabled", e2);
            }
        }
        if ((this.fastText == null && this.ngram == null) || fastTextFailed) {
            result = detectLanguageCode(this.textObjectFactory.forText(str).toString());
            if (noopLangs.size() > 0) {
                logger.warn("Cannot consider noopLanguages because not in fastText mode: " + noopLangs);
            }
        }
        if (result == null || result.getKey() == null || !canLanguageBeDetected(result.getKey(), noopLangs)) {
            return null;
        }
        return new DetectedLanguage(null, Languages.getLanguageForShortCode(result.getKey(), noopLangs), result.getValue().floatValue());
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Tells whether the given language code may be returned as a detection result.
     *
     * @param str  the language code to check
     * @param list additional codes that count as detectable
     * @return {@code true} if the code is a supported language or contained in {@code list}
     */
    public static boolean canLanguageBeDetected(String str, List<String> list) {
        if (Languages.isLanguageSupported(str)) {
            return true;
        }
        return list.contains(str);
    }

    /**
     * Picks the entry with the highest score. On ties the entry seen first in the
     * map's iteration order wins.
     *
     * @param map scores per language code
     * @return the best code with its score; the key is {@code null} and the value -1.0
     *         if the map is empty or no score is greater than -1.0
     */
    private Map.Entry<String, Double> getHighestScoringResult(Map<String, Double> map) {
        String bestCode = null;
        double bestScore = -1.0d;
        for (Map.Entry<String, Double> candidate : map.entrySet()) {
            double score = candidate.getValue().doubleValue();
            // Negated ">" (rather than "<=") keeps NaN scores from ever being selected
            if (!(score > bestScore)) {
                continue;
            }
            bestScore = score;
            bestCode = candidate.getKey();
        }
        return new AbstractMap.SimpleImmutableEntry<>(bestCode, Double.valueOf(bestScore));
    }

    /**
     * Runs the langdetect-based detector on the given text.
     *
     * @param str the text to analyze
     * @return language code and probability of the first reported candidate,
     *         or {@code null} if the detector reports none
     */
    @Nullable
    private Map.Entry<String, Double> detectLanguageCode(String str) {
        List<com.optimaize.langdetect.DetectedLanguage> candidates = this.languageDetector.getProbabilities(str);
        if (candidates.isEmpty()) {
            return null;
        }
        com.optimaize.langdetect.DetectedLanguage top = candidates.get(0);
        String code = top.getLocale().getLanguage();
        return new AbstractMap.SimpleImmutableEntry<>(code, Double.valueOf(top.getProbability()));
    }
}
