JavadocMetadataScraper.java

////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2021 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.meta;

import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;

import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.DetailNode;
import com.puppycrawl.tools.checkstyle.api.JavadocTokenTypes;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.checks.javadoc.AbstractJavadocCheck;
import com.puppycrawl.tools.checkstyle.utils.TokenUtil;

/**
 * Class for scraping module metadata from the corresponding class' class-level javadoc.
 */
@FileStatefulCheck
public class JavadocMetadataScraper extends AbstractJavadocCheck {

    /** Module details store used for testing. */
    private static final Map<String, ModuleDetails> MODULE_DETAILS_STORE = new HashMap<>();

    /** Regular expression for property location in class-level javadocs. */
    private static final Pattern PROPERTY_TAG = Pattern.compile("\\s*Property\\s*");

    /** Regular expression for property type location in class-level javadocs. */
    private static final Pattern TYPE_TAG = Pattern.compile("^ Type is\\s.*");

    /** Regular expression for property validation type location in class-level javadocs. */
    private static final Pattern VALIDATION_TYPE_TAG =
            Pattern.compile("\\s.*Validation type is\\s.*");

    /** Regular expression for property default value location in class-level javadocs. */
    private static final Pattern DEFAULT_VALUE_TAG = Pattern.compile("^ Default value is:*.*");

    /** Regular expression for check example location in class-level javadocs. */
    private static final Pattern EXAMPLES_TAG =
            Pattern.compile("\\s*To configure the (default )?check.*");

    /** Regular expression for module parent location in class-level javadocs. */
    private static final Pattern PARENT_TAG = Pattern.compile("\\s*Parent is\\s*");

    /** Regular expression for module violation messages location in class-level javadocs. */
    private static final Pattern VIOLATION_MESSAGES_TAG =
            Pattern.compile("\\s*Violation Message Keys:\\s*");

    /** Regular expression for detecting ANTLR tokens(for e.g. CLASS_DEF). */
    private static final Pattern TOKEN_TEXT_PATTERN = Pattern.compile("([A-Z_]{2,})+");

    /** Regular expression for removal of @code{-} present at the beginning of texts. */
    private static final Pattern DESC_CLEAN = Pattern.compile("-\\s");

    /** Regular expression for file separator corresponding to the host OS. */
    private static final Pattern FILE_SEPARATOR_PATTERN =
            Pattern.compile(Pattern.quote(System.getProperty("file.separator")));

    /** Regular expression for quotes. */
    private static final Pattern QUOTE_PATTERN = Pattern.compile("\"");

    /** Java file extension. */
    private static final String JAVA_FILE_EXTENSION = ".java";

    /**
     * This set contains faulty property default value which should not be written to the XML
     * metadata files.
     */
    private static final Set<String> PROPERTIES_TO_NOT_WRITE = Collections.unmodifiableSet(
            new HashSet<>(Arrays.asList(
                    "null",
                    "the charset property of the parent <a href=https://checkstyle.org/"
                        + "config.html#Checker>Checker</a> module"
    )));

    /**
     * Format for exception message for missing type for check property.
     */
    private static final String PROP_TYPE_MISSING = "Type for property '%s' is missing";

    /**
     * Format for exception message for missing default value for check property.
     */
    private static final String PROP_DEFAULT_VALUE_MISSING =
        "Default value for property '%s' is missing";

    /** ModuleDetails instance for each module AST traversal. */
    private ModuleDetails moduleDetails;

    /**
     * Boolean variable which lets us know whether violation message section is being scraped
     * currently.
     */
    private boolean scrapingViolationMessageList;

    /**
     * Boolean variable which lets us know whether we should scan and scrape the current javadoc
     * or not. Since we need only class level javadoc, it becomes true at its root and false after
     * encountering {@code JavadocTokenTypes.SINCE_LITERAL}.
     */
    private boolean toScan;

    /** DetailNode pointing to the root node of the class level javadoc of the class. */
    private DetailNode rootNode;

    /**
     * Child number of the property section node, where parent is the class level javadoc root
     * node.
     */
    private int propertySectionStartIdx;

    /**
     * Child number of the example section node, where parent is the class level javadoc root
     * node.
     */
    private int exampleSectionStartIdx;

    /**
     * Child number of the parent section node, where parent is the class level javadoc root
     * node.
     */
    private int parentSectionStartIdx;

    @Override
    public int[] getDefaultJavadocTokens() {
        return new int[] {
            JavadocTokenTypes.JAVADOC,
            JavadocTokenTypes.PARAGRAPH,
            JavadocTokenTypes.LI,
            JavadocTokenTypes.SINCE_LITERAL,
        };
    }

    @Override
    public int[] getRequiredJavadocTokens() {
        return getAcceptableJavadocTokens();
    }

    @Override
    public void beginJavadocTree(DetailNode rootAst) {
        if (isTopLevelClassJavadoc()) {
            moduleDetails = new ModuleDetails();
            toScan = false;
            scrapingViolationMessageList = false;
            propertySectionStartIdx = -1;
            exampleSectionStartIdx = -1;
            parentSectionStartIdx = -1;

            final String filePath = getFileContents().getFileName();
            String moduleName = getModuleSimpleName();
            final String checkModuleExtension = "Check";
            if (moduleName.contains(checkModuleExtension)) {
                moduleName = moduleName.substring(0, moduleName.indexOf(checkModuleExtension));
            }
            moduleDetails.setName(moduleName);
            moduleDetails.setFullQualifiedName(getPackageName(filePath));
            moduleDetails.setModuleType(getModuleType());
        }
    }

    @Override
    public void visitJavadocToken(DetailNode ast) {
        if (toScan) {
            scrapeContent(ast);
        }

        if (ast.getType() == JavadocTokenTypes.JAVADOC) {
            final DetailAST parent = getParent(getBlockCommentAst());
            if (parent.getType() == TokenTypes.CLASS_DEF) {
                rootNode = ast;
                toScan = true;
            }
        }
        else if (ast.getType() == JavadocTokenTypes.SINCE_LITERAL) {
            toScan = false;
        }
    }

    @Override
    public void finishJavadocTree(DetailNode rootAst) {
        moduleDetails.setDescription(getDescriptionText());
        if (isTopLevelClassJavadoc()) {
            if (getFileContents().getFileName().contains("test")) {
                MODULE_DETAILS_STORE.put(moduleDetails.getFullQualifiedName(), moduleDetails);
            }
            else {
                try {
                    XmlMetaWriter.write(moduleDetails);
                }
                catch (TransformerException | ParserConfigurationException ex) {
                    throw new IllegalStateException("Failed to write metadata into XML file for "
                            + "module: " + getModuleSimpleName(), ex);
                }
            }
        }
    }

    /**
     * Method containing the core logic of scraping. This keeps track and decides which phase of
     * scraping we are in, and accordingly call other subroutines.
     *
     * @param ast javadoc ast
     */
    public void scrapeContent(DetailNode ast) {
        if (ast.getType() == JavadocTokenTypes.PARAGRAPH) {
            if (isParentText(ast)) {
                parentSectionStartIdx = getParentIndexOf(ast);
                moduleDetails.setParent(getParentText(ast));
            }
            else if (isViolationMessagesText(ast)) {
                scrapingViolationMessageList = true;
            }
            else if (exampleSectionStartIdx == -1
                    && isExamplesText(ast)) {
                exampleSectionStartIdx = getParentIndexOf(ast);
            }
        }
        else if (ast.getType() == JavadocTokenTypes.LI) {
            if (isPropertyList(ast)) {
                if (propertySectionStartIdx == -1) {
                    propertySectionStartIdx = getParentIndexOf(ast);
                }
                moduleDetails.addToProperties(createProperties(ast));
            }
            else if (scrapingViolationMessageList) {
                moduleDetails.addToViolationMessages(getViolationMessages(ast));
            }
        }
    }

    /**
     * Create the modulePropertyDetails content.
     *
     * @param nodeLi list item javadoc node
     * @return modulePropertyDetail object for the corresponding property
     */
    private static ModulePropertyDetails createProperties(DetailNode nodeLi) {
        final ModulePropertyDetails modulePropertyDetails = new ModulePropertyDetails();

        final Optional<DetailNode> propertyNameNode = getFirstChildOfType(nodeLi,
                JavadocTokenTypes.JAVADOC_INLINE_TAG, 0);
        if (propertyNameNode.isPresent()) {
            final DetailNode propertyNameTag = propertyNameNode.get();
            final String propertyName = getTextFromTag(propertyNameTag);

            final DetailNode propertyType = getFirstChildOfMatchingText(nodeLi, TYPE_TAG)
                .orElseThrow(() -> {
                    return new MetadataGenerationException(String.format(
                        Locale.ROOT, PROP_TYPE_MISSING, propertyName)
                    );
                });
            final String propertyDesc = DESC_CLEAN.matcher(
                    constructSubTreeText(nodeLi, propertyNameTag.getIndex() + 1,
                            propertyType.getIndex() - 1))
                    .replaceAll(Matcher.quoteReplacement(""));

            modulePropertyDetails.setDescription(propertyDesc.trim());
            modulePropertyDetails.setName(propertyName);
            modulePropertyDetails.setType(getTagTextFromProperty(nodeLi, propertyType));

            final Optional<DetailNode> validationTypeNodeOpt = getFirstChildOfMatchingText(nodeLi,
                VALIDATION_TYPE_TAG);
            if (validationTypeNodeOpt.isPresent()) {
                final DetailNode validationTypeNode = validationTypeNodeOpt.get();
                modulePropertyDetails.setValidationType(getTagTextFromProperty(nodeLi,
                    validationTypeNode));
            }

            final String defaultValue = getFirstChildOfMatchingText(nodeLi, DEFAULT_VALUE_TAG)
                .map(defaultValueNode -> getPropertyDefaultText(nodeLi, defaultValueNode))
                .orElseThrow(() -> {
                    return new MetadataGenerationException(String.format(
                        Locale.ROOT, PROP_DEFAULT_VALUE_MISSING, propertyName)
                    );
                });
            if (!PROPERTIES_TO_NOT_WRITE.contains(defaultValue)) {
                modulePropertyDetails.setDefaultValue(defaultValue);
            }
        }
        return modulePropertyDetails;
    }

    /**
     * Get tag text from property data.
     *
     * @param nodeLi javadoc li item node
     * @param propertyMeta property javadoc node
     * @return property metadata text
     */
    private static String getTagTextFromProperty(DetailNode nodeLi, DetailNode propertyMeta) {
        final Optional<DetailNode> tagNodeOpt = getFirstChildOfType(nodeLi,
                JavadocTokenTypes.JAVADOC_INLINE_TAG, propertyMeta.getIndex() + 1);
        DetailNode tagNode = null;
        if (tagNodeOpt.isPresent()) {
            tagNode = tagNodeOpt.get();
        }
        return getTextFromTag(tagNode);
    }

    /**
     * Clean up the default token text by removing hyperlinks, and only keeping token type text.
     *
     * @param initialText unclean text
     * @return clean text
     */
    private static String cleanDefaultTokensText(String initialText) {
        final Set<String> tokens = new LinkedHashSet<>();
        final Matcher matcher = TOKEN_TEXT_PATTERN.matcher(initialText);
        while (matcher.find()) {
            tokens.add(matcher.group(0));
        }
        return String.join(",", tokens);
    }

    /**
     * Performs a DFS of the subtree with a node as the root and constructs the text of that
     * tree, ignoring JavadocToken texts.
     *
     * @param node root node of subtree
     * @param childLeftLimit the left index of root children from where to scan
     * @param childRightLimit the right index of root children till where to scan
     * @return constructed text of subtree
     */
    private static String constructSubTreeText(DetailNode node, int childLeftLimit,
                                               int childRightLimit) {
        final StringBuilder result = new StringBuilder(1024);
        DetailNode detailNode = node;

        final Deque<DetailNode> stack = new ArrayDeque<>();
        stack.addFirst(detailNode);
        final Set<DetailNode> visited = new HashSet<>();
        while (!stack.isEmpty()) {
            detailNode = stack.getFirst();
            stack.removeFirst();

            if (!visited.contains(detailNode)) {
                final String childText = detailNode.getText();
                if (detailNode.getType() != JavadocTokenTypes.LEADING_ASTERISK
                        && !TOKEN_TEXT_PATTERN.matcher(childText).matches()) {
                    result.insert(0, detailNode.getText());
                }
                visited.add(detailNode);
            }

            for (DetailNode child : detailNode.getChildren()) {
                if (child.getParent().equals(node)
                        && (child.getIndex() < childLeftLimit
                        || child.getIndex() > childRightLimit)) {
                    continue;
                }
                if (!visited.contains(child)) {
                    stack.addFirst(child);
                }
            }
        }
        return result.toString().trim();
    }

    /**
     * Create the description text with starting index as 0 and ending index would be the first
     * valid non zero index amongst in the order of {@code propertySectionStartIdx},
     * {@code exampleSectionStartIdx} and {@code parentSectionStartIdx}.
     *
     * @return description text
     */
    private String getDescriptionText() {
        final int descriptionEndIdx;
        if (propertySectionStartIdx > -1) {
            descriptionEndIdx = propertySectionStartIdx;
        }
        else if (exampleSectionStartIdx > -1) {
            descriptionEndIdx = exampleSectionStartIdx;
        }
        else {
            descriptionEndIdx = parentSectionStartIdx;
        }
        return constructSubTreeText(rootNode, 0, descriptionEndIdx - 1);
    }

    /**
     * Create property default text, which is either normal property value or list of tokens.
     *
     * @param nodeLi list item javadoc node
     * @param defaultValueNode default value node
     * @return default property text
     */
    private static String getPropertyDefaultText(DetailNode nodeLi, DetailNode defaultValueNode) {
        final Optional<DetailNode> propertyDefaultValueTag = getFirstChildOfType(nodeLi,
                JavadocTokenTypes.JAVADOC_INLINE_TAG, defaultValueNode.getIndex() + 1);
        final String result;
        if (propertyDefaultValueTag.isPresent()) {
            result = getTextFromTag(propertyDefaultValueTag.get());
        }
        else {
            final String tokenText = constructSubTreeText(nodeLi,
                    defaultValueNode.getIndex(), nodeLi.getChildren().length);
            result = cleanDefaultTokensText(tokenText);
        }
        return result;
    }

    /**
     * Get the violation message text for a specific key from the list item.
     *
     * @param nodeLi list item javadoc node
     * @return violation message key text
     */
    private static String getViolationMessages(DetailNode nodeLi) {
        final Optional<DetailNode> resultNode = getFirstChildOfType(nodeLi,
                JavadocTokenTypes.JAVADOC_INLINE_TAG, 0);
        return resultNode.map(JavadocMetadataScraper::getTextFromTag).orElse("");
    }

    /**
     * Get text from {@code JavadocTokenTypes.JAVADOC_INLINE_TAG}.
     *
     * @param nodeTag target javadoc tag
     * @return text contained by the tag
     */
    private static String getTextFromTag(DetailNode nodeTag) {
        return Optional.ofNullable(nodeTag).map(JavadocMetadataScraper::getText).orElse("");
    }

    /**
     * Returns the first child node which matches the provided {@code TokenType} and has the
     * children index after the offset value.
     *
     * @param node parent node
     * @param tokenType token type to match
     * @param offset children array index offset
     * @return the first child satisfying the conditions
     */
    private static Optional<DetailNode> getFirstChildOfType(DetailNode node, int tokenType,
                                                            int offset) {
        return Arrays.stream(node.getChildren())
                .filter(child -> child.getIndex() >= offset && child.getType() == tokenType)
                .findFirst();
    }

    /**
     * Get joined text from all text children nodes.
     *
     * @param parentNode parent node
     * @return the joined text of node
     */
    private static String getText(DetailNode parentNode) {
        return Arrays.stream(parentNode.getChildren())
                .filter(child -> child.getType() == JavadocTokenTypes.TEXT)
                .map(node -> QUOTE_PATTERN.matcher(node.getText().trim()).replaceAll(""))
                .collect(Collectors.joining(" "));
    }

    /**
     * Get first child of parent node matching the provided pattern.
     *
     * @param node parent node
     * @param pattern pattern to match against
     * @return the first child node matching the condition
     */
    private static Optional<DetailNode> getFirstChildOfMatchingText(DetailNode node,
                                                                    Pattern pattern) {
        return Arrays.stream(node.getChildren())
                .filter(child -> pattern.matcher(child.getText()).matches())
                .findFirst();
    }

    /**
     * Returns parent node, removing modifier/annotation nodes.
     *
     * @param commentBlock child node.
     * @return parent node.
     */
    private static DetailAST getParent(DetailAST commentBlock) {
        final DetailAST parentNode = commentBlock.getParent();
        DetailAST result = parentNode;
        if (result.getType() == TokenTypes.ANNOTATION) {
            result = parentNode.getParent().getParent();
        }
        else if (result.getType() == TokenTypes.MODIFIERS) {
            result = parentNode.getParent();
        }
        return result;
    }

    /**
     * Traverse parents until we reach the root node (@code{JavadocTokenTypes.JAVADOC})
     * child and return its index.
     *
     * @param node subtree child node
     * @return root node child index
     */
    private static int getParentIndexOf(DetailNode node) {
        DetailNode currNode = node;
        while (currNode.getParent().getIndex() != -1) {
            currNode = currNode.getParent();
        }
        return currNode.getIndex();
    }

    /**
     * Get module parent text from paragraph javadoc node.
     *
     * @param nodeParagraph paragraph javadoc node
     * @return parent text
     */
    private static String getParentText(DetailNode nodeParagraph) {
        return getFirstChildOfType(nodeParagraph, JavadocTokenTypes.JAVADOC_INLINE_TAG, 0)
                .map(JavadocMetadataScraper::getTextFromTag)
                .orElse(null);
    }

    /**
     * Get module type(check/filter/filefilter) based on file name.
     *
     * @return module type
     */
    private ModuleType getModuleType() {
        final String simpleModuleName = getModuleSimpleName();
        final ModuleType result;
        if (simpleModuleName.endsWith("FileFilter")) {
            result = ModuleType.FILEFILTER;
        }
        else if (simpleModuleName.endsWith("Filter")) {
            result = ModuleType.FILTER;
        }
        else {
            result = ModuleType.CHECK;
        }
        return result;
    }

    /**
     * Extract simple file name from the whole file path name.
     *
     * @return simple module name
     */
    private String getModuleSimpleName() {
        final String fullFileName = getFileContents().getFileName();
        final String[] pathTokens = FILE_SEPARATOR_PATTERN.split(fullFileName);
        final String fileName = pathTokens[pathTokens.length - 1];
        return fileName.substring(0, fileName.length() - JAVA_FILE_EXTENSION.length());
    }

    /**
     * Retrieve package name of module from the absolute file path.
     *
     * @param filePath absolute file path
     * @return package name
     */
    private static String getPackageName(String filePath) {
        final Deque<String> result = new ArrayDeque<>();
        final String[] filePathTokens = FILE_SEPARATOR_PATTERN.split(filePath);
        for (int i = filePathTokens.length - 1; i >= 0; i--) {
            if ("java".equals(filePathTokens[i]) || "resources".equals(filePathTokens[i])) {
                break;
            }
            result.addFirst(filePathTokens[i]);
        }
        final String fileName = result.removeLast();
        result.addLast(fileName.substring(0, fileName.length() - JAVA_FILE_EXTENSION.length()));
        return String.join(".", result);
    }

    /**
     * Getter method for {@code moduleDetailsStore}.
     *
     * @return map containing module details of supplied checks.
     */
    public static Map<String, ModuleDetails> getModuleDetailsStore() {
        return Collections.unmodifiableMap(MODULE_DETAILS_STORE);
    }

    /**
     * Check if the current javadoc block comment AST corresponds to the top-level class as we
     * only want to scrape top-level class javadoc.
     *
     * @return true if the current AST corresponds to top level class
     */
    public boolean isTopLevelClassJavadoc() {
        final DetailAST parent = getParent(getBlockCommentAst());
        final Optional<DetailAST> className = TokenUtil
                .findFirstTokenByPredicate(parent, child -> {
                    return parent.getType() == TokenTypes.CLASS_DEF
                            && child.getType() == TokenTypes.IDENT;
                });
        return className.isPresent()
                && getModuleSimpleName().equals(className.get().getText());
    }

    /**
     * Checks whether the paragraph node corresponds to the example section.
     *
     * @param ast javadoc paragraph node
     * @return true if the section matches the example section marker
     */
    private static boolean isExamplesText(DetailNode ast) {
        return isChildNodeTextMatches(ast, EXAMPLES_TAG);
    }

    /**
     * Checks whether the list item node is part of a property list.
     *
     * @param nodeLi {@code JavadocTokenType.LI} node
     * @return true if the node is part of a property list
     */
    private static boolean isPropertyList(DetailNode nodeLi) {
        return isChildNodeTextMatches(nodeLi, PROPERTY_TAG);
    }

    /**
     * Checks whether the {@code JavadocTokenType.PARAGRAPH} node is referring to the violation
     * message keys javadoc segment.
     *
     * @param nodeParagraph paragraph javadoc node
     * @return true if paragraph node contains the violation message keys text
     */
    private static boolean isViolationMessagesText(DetailNode nodeParagraph) {
        return isChildNodeTextMatches(nodeParagraph, VIOLATION_MESSAGES_TAG);
    }

    /**
     * Checks whether the {@code JavadocTokenType.PARAGRAPH} node is referring to the parent
     * javadoc segment.
     *
     * @param nodeParagraph paragraph javadoc node
     * @return true if paragraph node contains the parent text
     */
    private static boolean isParentText(DetailNode nodeParagraph) {
        return isChildNodeTextMatches(nodeParagraph, PARENT_TAG);
    }

    /**
     * Checks whether the first child {@code JavadocTokenType.TEXT} node matches given pattern.
     *
     * @param ast parent javadoc node
     * @param pattern pattern to match
     * @return true if one of child text nodes matches pattern
     */
    private static boolean isChildNodeTextMatches(DetailNode ast, Pattern pattern) {
        return getFirstChildOfType(ast, JavadocTokenTypes.TEXT, 0)
                .map(DetailNode::getText)
                .map(pattern::matcher)
                .map(Matcher::matches)
                .orElse(false);
    }
}