/*
 * Decompiled with CFR 0.152.
 */
package com.teamscale.index.tests.information_retrieval;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
import com.teamscale.index.comment_analysis.identifier.EStemmer;
import com.teamscale.index.resource.TokenElementInfo;
import com.teamscale.index.tests.information_retrieval.data.StopWordSet;
import eu.cqse.check.framework.scanner.ELanguage;
import eu.cqse.check.framework.scanner.ETokenType;
import eu.cqse.check.framework.scanner.IToken;
import eu.cqse.check.framework.shallowparser.framework.ShallowEntity;
import eu.cqse.check.framework.util.tokens.TokenUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.conqat.lib.commons.collections.CollectionUtils;
import org.conqat.lib.commons.collections.CounterSet;
import org.conqat.lib.commons.region.OffsetBasedRegion;
import org.jspecify.annotations.Nullable;

/**
 * Turns scanner tokens (comments, selected literals and identifiers) into
 * lower-cased search terms, and collects the terms that describe the context
 * surrounding a code location.
 */
public class TokenTermExtractor {
    /**
     * Characters that are dropped from comment and literal text before the text
     * is split on whitespace.
     */
    private static final Set<Character> CHARS_TO_REMOVE = Set.of('/', '*', '.', ',', ';', ':', '!', '?', '=', '(', ')', '{', '}', '[', ']', '"', '-', '+', '~', '^', '&', '|', '\'', '\n', '%', '@', '>', '<', '`');

    /** Matches a lower-case run directly followed by an upper-case letter ("fooBar"). */
    private static final Pattern CAMEL_CASE_PATTERN = Pattern.compile("([a-z]+)([A-Z])");

    /** Matches an upper-case run followed by a capitalized word ("HTTPServer"). */
    private static final Pattern PASCAL_CASE_PATTERN = Pattern.compile("([A-Z]{1,100})([A-Z])([a-z]+)");

    /** Literal token types whose text is turned into terms; all other literals are skipped. */
    private static final Set<ETokenType> RELEVANT_LITERAL_TYPES = Set.of(ETokenType.STRING_LITERAL, ETokenType.BACKTICK_STRING_LITERAL, ETokenType.REGEX_LITERAL, ETokenType.TEMPLATE_LITERAL, ETokenType.CLASS_LITERAL);

    /** Shallow entity subtypes whose own tokens always contribute to the context bag of words. */
    private static final Set<String> RELEVANT_CONTEXT_SUBTYPES = Set.of("package", "import", "static import", "include", "require");

    /**
     * Extracts the terms of all given tokens that are not macro generated.
     *
     * @param tokens the tokens to extract terms from
     * @return the extracted terms in token order, lower-cased independently of
     *         the default locale
     */
    public static List<String> extractContentAsList(List<IToken> tokens) {
        List<String> terms = new ArrayList<>();
        for (IToken token : tokens) {
            if (!TokenUtils.isMacroGenerated(token)) {
                extractContentToList(token, terms);
            }
        }
        // Locale.ROOT keeps the terms stable regardless of the JVM default locale.
        return terms.stream().map(term -> term.toLowerCase(Locale.ROOT)).collect(Collectors.toList());
    }

    /**
     * Appends the terms of a single token to {@code targetList}. Comments and
     * identifiers are additionally stop-word filtered, relevant literals are
     * not. Tokens of any other class contribute nothing.
     */
    private static void extractContentToList(IToken token, List<String> targetList) {
        switch (token.getType().getTokenClass()) {
            case COMMENT: {
                for (String commentWord : removeBadAndSplit(token.getText())) {
                    // Comments are filtered against the token's language and plain text.
                    Collections.addAll(targetList, preprocessAndFilter(commentWord, token.getLanguage(), ELanguage.TEXT));
                }
                break;
            }
            case LITERAL: {
                if (!RELEVANT_LITERAL_TYPES.contains(token.getType())) {
                    break;
                }
                for (String literalWord : removeBadAndSplit(token.getText())) {
                    Collections.addAll(targetList, preprocessToken(org.conqat.lib.commons.string.StringUtils.removeDoubleQuotes(literalWord)));
                }
                break;
            }
            case IDENTIFIER: {
                Collections.addAll(targetList, preprocessAndFilter(token.getText(), token.getLanguage()));
                break;
            }
            default:
                // Other token classes do not contribute terms.
                break;
        }
    }

    /**
     * Preprocesses the token via {@link #preprocessToken(String)} and removes
     * every resulting term that is a stop word in any of the given languages.
     */
    private static String[] preprocessAndFilter(@Nullable String token, ELanguage... languages) {
        List<String> kept = new ArrayList<>();
        for (String term : preprocessToken(token)) {
            if (!isStopWord(term, languages)) {
                kept.add(term);
            }
        }
        return kept.toArray(new String[0]);
    }

    /**
     * Returns whether {@link StopWordSet#shouldIgnore} reports the word for at
     * least one of the given languages.
     */
    private static boolean isStopWord(String word, ELanguage... languages) {
        for (ELanguage language : languages) {
            if (StopWordSet.shouldIgnore(word, language)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Splits a token into lower-cased terms. Underscores and dots are treated
     * as separators, camel-case and Pascal-case boundaries are split, and the
     * result is split on whitespace. The returned array contains, in this
     * order: the individual words, the English stems that differ from their
     * word, and - if splitting changed the token - the whole lower-cased,
     * space-separated token. Terms for which the length check ({@code > 2})
     * passed to {@code CollectionUtils.filterArray} fails are expected to be
     * dropped by that helper.
     *
     * @param inputTokenNormalized the token text, may be {@code null}
     * @return the terms; an empty array for {@code null} input
     */
    @VisibleForTesting
    public static String[] preprocessToken(@Nullable String inputTokenNormalized) {
        if (inputTokenNormalized == null) {
            return new String[0];
        }
        String token = inputTokenNormalized.replace('_', ' ').replace('.', ' ');
        token = CAMEL_CASE_PATTERN.matcher(token).replaceAll("$1 $2");
        token = PASCAL_CASE_PATTERN.matcher(token).replaceAll("$1 $2$3");
        // Locale.ROOT avoids locale specific case mappings (e.g. the Turkish dotless i).
        token = token.toLowerCase(Locale.ROOT);

        String[] words = StringUtils.split(token);
        // Layout: [words..., differing stems...]; sized for the worst case of one stem per word.
        String[] wordsAndStems = new String[words.length * 2];
        int differingStems = 0;
        for (int i = 0; i < words.length; ++i) {
            String word = words[i];
            wordsAndStems[i] = word;
            String stem = EStemmer.ENGLISH.stem(word);
            if (!stem.equals(word)) {
                wordsAndStems[words.length + differingStems] = stem;
                ++differingStems;
            }
        }

        // Only append the whole token if splitting actually changed it.
        boolean appendWholeToken = !inputTokenNormalized.toLowerCase(Locale.ROOT).equals(token);
        int termCount = words.length + differingStems + (appendWholeToken ? 1 : 0);
        String[] terms = Arrays.copyOf(wordsAndStems, termCount);
        if (appendWholeToken) {
            terms[terms.length - 1] = token;
        }
        return CollectionUtils.filterArray(terms, term -> term.length() > 2);
    }

    /** Removes all {@link #CHARS_TO_REMOVE} from the input and splits the rest on whitespace. */
    private static List<String> removeBadAndSplit(String input) {
        return org.conqat.lib.commons.string.StringUtils.splitToList(removeBadChars(input), "\\s+");
    }

    /**
     * Returns the input without any character contained in
     * {@link #CHARS_TO_REMOVE}.
     */
    private static String removeBadChars(String input) {
        // NOTE(review): characters are dropped, not replaced by a space, so words
        // separated only by such a character (including '\n') end up joined.
        // Kept as is because changing it would alter the produced terms.
        StringBuilder cleaned = new StringBuilder(input.length());
        for (char c : input.toCharArray()) {
            if (!CHARS_TO_REMOVE.contains(c)) {
                cleaned.append(c);
            }
        }
        return cleaned.toString();
    }

    /**
     * Collects the terms describing the context of a location in a file: the
     * terms of the own tokens of all top-level entities with a relevant subtype
     * (see {@link #RELEVANT_CONTEXT_SUBTYPES}), plus the stop-word filtered name
     * terms of every entity that completely encloses the given region.
     *
     * @param fileTokenElementInfo provides the shallow entities and language of the file
     * @param locationWithComment  the region whose enclosing entities are of interest
     * @return a counter set built from all collected terms
     */
    public static CounterSet<String> getContextBagOfWords(TokenElementInfo fileTokenElementInfo, OffsetBasedRegion locationWithComment) {
        List<String> contextTerms = Lists.newArrayList();
        LinkedList<ShallowEntity> worklist = Lists.newLinkedList();
        for (ShallowEntity entity : fileTokenElementInfo.getShallowEntitiesWithoutPreprocessorTokens()) {
            if (RELEVANT_CONTEXT_SUBTYPES.contains(entity.getSubtype())) {
                for (List<IToken> tokens : entity.ownTokens()) {
                    contextTerms.addAll(extractContentAsList(tokens));
                }
            } else {
                worklist.add(entity);
            }
        }

        // Descend only into entities that fully contain the region.
        while (!worklist.isEmpty()) {
            ShallowEntity work = worklist.pop();
            boolean enclosesLocation = work.getStartOffset() <= locationWithComment.getStart()
                    && work.getEndOffset() >= locationWithComment.getEnd();
            if (!enclosesLocation) {
                continue;
            }
            Collections.addAll(contextTerms, preprocessAndFilter(work.getName(), fileTokenElementInfo.getLanguage()));
            worklist.addAll(work.getChildren());
        }
        return new CounterSet<String>(contextTerms);
    }
}

