#!/usr/bin/env python3

"""
Downloads and updates documentation for clang-tidy checks from llvm-project GitHub repository.

If the rst documentation file for a check points to another rst file for the detailed check description,
this script copies the content from that file into the original rst file.
If the rst documentation file for a check points to a separate html page for the detailed check description,
this script parses the html page and updates the rst file with the content from the html page.

Repo url: https://github.com/llvm/llvm-project
Doc path: https://github.com/llvm/llvm-project/tree/main/clang-tools-extra/docs/clang-tidy/checks/

usage: clang_tidy_doc_gen [-h] [--tag TAG]
example: python3 clang_tidy_doc_gen.py --tag llvmorg-15.0.7

If no tag is given, the documentation is downloaded from the repository's default branch.

Limitation: the script can fail when the GitHub API request rate limit is reached.
"""


import argparse
import os
import re
import requests
from bs4 import BeautifulSoup
from github import ContentFile, Github
from typing import Any

# Timeout, in seconds, passed to every requests.get call in this file
TIMEOUT_SEC = 5

# LLVM project repository on GitHub ("owner/name"), passed to Github.get_repo
LLVM_REPO_URL = "llvm/llvm-project"

# clang-tidy checks documentation path in GitHub repo
CLANG_TIDY_DOCS_REPO_PATH = "clang-tools-extra/docs/clang-tidy/checks"

# Path for clang-tidy html documentation containing detailed documentation for checks
# NOTE(review): no code visible in this file reads this constant - confirm it is still needed
CLANG_TIDY_DOCS_HTML_PATH = "https://clang.llvm.org/docs/analyzer/checkers.html"

# Path where the downloaded rst files will be stored
# (<current working directory>/clang-analyzer/, evaluated once at import time)
# NOTE(review): no code visible in this file reads this constant - confirm it is still needed
DOWNLOAD_PATH = f"{os.getcwd()}{os.sep}clang-analyzer{os.sep}"

# The pattern after which the html content is replaced in rst doc for clang analyzer.
# It matches a newline, a title line containing at least two hyphens and an
# underline made of one or more '=' characters, including the trailing newline.
# example:
#
#     clang-analyzer-osx.SecKeychainAPI
#     =================================
DOC_REPLACE_AFTER_PATTERN = r"\n.*-.*-.*\n==*\n"

# Code section to insert in rst document (directive line followed by a blank line)
RST_CPP_CODE_SECTION = ".. code:: c++\n\n"

# URL pattern in rst files: matches 'http-equiv=refresh: <digits>;URL=<url>'
# and captures <url> (everything up to the next whitespace) as group 1
RST_URL_PATTERN = r'http-equiv=refresh:\s*\d+;URL=([^\s]+)'

# Patterns to remove lines after we copy the content from a referencing rst file.
# Eg. the line '`cert-err09-cpp` redirects here as an alias for this check.' should be
# removed as it is no longer needed after we copy the content from a referred rst file
# to the original rst file.
# Note: the final '.' of the first pattern is not escaped, so it matches any character.
REDIRECT_LINE_PATTERN1 = r"\n.* redirects here as an alias for this check."
REDIRECT_LINE_PATTERN2 = r"For the CERT alias,\nthe `.*` option is set to `.*`\."


def download_file(content_file: ContentFile, output_dir: str) -> None:
    """Downloads a file from GitHub and stores it in output directory

    A file that already exists in the output directory is left untouched and
    no request is made for it.

    Args:
        content_file (ContentFile): file to download
        output_dir (str): directory to store file in

    Raises:
        requests.HTTPError: if the download request returns an error status
    """

    output_path = f'{output_dir}{os.sep}{content_file.name}'
    if os.path.exists(output_path):
        return

    response = requests.get(content_file.download_url, timeout=TIMEOUT_SEC)
    # Fail before anything is written: an error body (e.g. a rate limit page)
    # saved under the check's file name would be treated as "already
    # downloaded" by the existence check above on every later run.
    response.raise_for_status()

    print(f'downloading file {content_file.path} to {output_path}')
    with open(output_path, 'wb') as file:
        file.write(response.content)


def get_content_from_section(section: Any) -> str:
    """Convert the paragraphs and code samples found in a section tag to rst text

    Every descendant tag of the section is visited in document order. A ``p``
    tag becomes a single line paragraph (prefixed with ``* `` when its parent
    is an ``li`` tag), a ``pre`` tag becomes an indented c++ code section and
    every other tag is ignored.

    Args:
        section (Any): section tag containing the documentation for a clang-tidy check

    Returns:
        str: rst content built from the section tag
    """

    pieces = []

    for node in section.find_all(recursive=True):
        kind = node.name

        if kind == "pre":
            # pre tag contains code examples, rst expects them indented
            indented = "\n".join("    " + code_line for code_line in node.text.splitlines())
            pieces.append(RST_CPP_CODE_SECTION + indented + "\n\n")
            continue

        if kind != "p":
            continue

        # Paragraphs nested in a list item are rendered as bullet points
        bullet = "* " if node.parent.name == "li" else ""
        pieces.append(bullet + node.text.replace("\n", " ") + "\n\n")

    return "".join(pieces)


def _copy_content_from_rst(rst_path: str) -> str:
    """Return the description copied from the rst file an alias check refers to

    The title of the referred file (everything up to the end of the first
    DOC_REPLACE_AFTER_PATTERN match) and the alias notes matched by
    REDIRECT_LINE_PATTERN1 and REDIRECT_LINE_PATTERN2 are dropped.

    Args:
        rst_path (str): path of the referred rst file

    Returns:
        str: content to append after the title of the alias file, or an empty
        string if the referred file cannot be read or has no title
    """

    try:
        with open(rst_path, "r", encoding="utf-8") as rst_file:
            rst_content = rst_file.read()
    except OSError as err:
        print(f"Skipping, cannot read referred file {rst_path}: {err}")
        return ""

    match = re.search(DOC_REPLACE_AFTER_PATTERN, rst_content)
    if not match:
        # Without a title there is no way to tell where the description starts
        print(f"Skipping, no check title found in {rst_path}")
        return ""

    content_to_copy = rst_content[match.end():]
    content_to_copy = re.sub(REDIRECT_LINE_PATTERN1, '', content_to_copy).strip()
    content_to_copy = re.sub(REDIRECT_LINE_PATTERN2, '', content_to_copy).strip()
    return f"\n{content_to_copy}\n"


def _copy_content_from_html(url: str, page_cache: dict) -> str:
    """Return the description of a check taken from an html page

    The url has the following structure:
    https://clang.llvm.org/docs/analyzer/checkers.html#core-nulldereference
    The part after the `#` identifies the section (or span) describing the check.

    Args:
        url (str): url of the html page, including the `#` fragment
        page_cache (dict): parsed pages keyed by url without fragment (None for
            pages that could not be fetched); filled in by this function so
            every page is requested only once

    Returns:
        str: content to append after the title of the rst file, or an empty
        string if the url has no fragment, the page could not be fetched or
        the page has no element with the requested id
    """

    page_url, _, doc_identifier = url.partition("#")
    if not doc_identifier:
        print(f"Skipping, no section identifier in url {url}")
        return ""

    if page_url not in page_cache:
        response = requests.get(page_url, timeout=TIMEOUT_SEC)
        if response.ok:
            response.encoding = 'utf-8'
            page_cache[page_url] = BeautifulSoup(response.text, 'html.parser')
        else:
            print(f"Skipping, {page_url} returned HTTP status {response.status_code}")
            page_cache[page_url] = None

    soup = page_cache[page_url]
    if soup is None:
        return ""

    # The check description is either in the section tag having id as
    # doc_identifier or in the parent of the span tag having that id
    doc_html_content = soup.find('section', id=doc_identifier)
    if doc_html_content is None:
        span = soup.find('span', id=doc_identifier)
        if span is not None:
            doc_html_content = span.parent

    if doc_html_content is None:
        return ""

    return "\n" + get_content_from_section(doc_html_content)


def update_rst_docs() -> None:
    """
    Update rst documentation for checks.

    Walks the current working directory and, for every rst file that redirects
    to another document (see RST_URL_PATTERN), replaces the file with its own
    title followed by the content of the document it redirects to: either
    another rst file (relative url starting with `..`) or a section of an html
    page. Files without a redirect, and files whose target content cannot be
    retrieved, are left unchanged.
    """

    # Many checks point into the same html page, keep parsed pages around
    page_cache = {}

    for root, _, files in os.walk(os.getcwd()):
        for file_name in files:
            if not file_name.endswith(".rst"):
                continue

            rst_path = os.path.join(root, file_name)
            print(f"Processing file: {rst_path}")

            # Read the whole file and close it before it is possibly rewritten below
            with open(rst_path, "r", encoding="utf-8") as file:
                curr_file_content = file.read()

            # Get the URL to which document points to
            match = re.search(RST_URL_PATTERN, curr_file_content)
            if not match:
                continue
            url = match.group(1)

            if url.startswith(".."):
                # The URL points to a rst file in another directory,
                # eg. URL=../readability/implicit-bool-conversion.html
                # It is relative to the directory of the file being processed.
                relative_rst = url.partition("#")[0].replace(".html", ".rst")
                referred_path = os.path.normpath(os.path.join(root, relative_rst))
                new_content = _copy_content_from_rst(referred_path)
            else:
                # Else URL points to an html page
                new_content = _copy_content_from_html(url, page_cache)

            if not new_content:
                continue

            # Keep the content of the rst file up to and including its title and
            # replace everything after it (the redirect) with the retrieved content.
            title = re.search(DOC_REPLACE_AFTER_PATTERN, curr_file_content)
            header = curr_file_content[:title.end()] if title else ""

            with open(rst_path, "w", encoding="utf-8") as file:
                file.write(header + new_content)


def add_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the command line options of the script on the given parser

    Args:
        parser (argparse.ArgumentParser): parser that receives the options
    """

    # Optional release tag, the parsed value stays None when it is not passed
    tag_option = {"help": "release tag to fetch documentation from"}
    parser.add_argument("--tag", **tag_option)


def main() -> None:
    """Main function

    Parses the command line, downloads every file of every directory found in
    CLANG_TIDY_DOCS_REPO_PATH (from the given release tag, or from the default
    branch when no tag is given) into a directory of the same name below the
    current working directory and finally resolves the redirects in the
    downloaded rst files.

    Raises:
        requests.HTTPError: if the tag object cannot be fetched from GitHub
    """

    # Init parser
    parser = argparse.ArgumentParser(
        prog="clang_tidy_doc_gen",
        description="Fetch documentation for clang tidy checks from llvm-project repository"
    )

    add_args(parser)
    args = parser.parse_args()

    github = Github()
    repo = github.get_repo(LLVM_REPO_URL)

    # if tag is provided download from release tag
    # else download from main branch (no ref argument is passed at all)
    ref_kwargs = {}
    if args.tag:
        tag_ref = repo.get_git_ref(f"tags/{args.tag}")

        if tag_ref.object.type == "tag":
            # Annotated tag: the ref points to a tag object, which in turn
            # points to the tagged commit.
            response = requests.get(tag_ref.object.url, timeout=TIMEOUT_SEC)
            # Report a failed request (e.g. rate limit) as such instead of as
            # a KeyError on the error payload.
            response.raise_for_status()
            tag_commit = response.json()["object"]["sha"]
        else:
            # Lightweight tag: the ref already points to the commit
            tag_commit = tag_ref.object.sha

        ref_kwargs["ref"] = tag_commit

    contents = repo.get_contents(path=CLANG_TIDY_DOCS_REPO_PATH, **ref_kwargs)

    for content in contents:
        if content.type != 'dir':
            continue

        print(f"Creating directory: {content.name}")
        os.makedirs(content.name, exist_ok=True)

        file_contents = repo.get_contents(
            path=f"{CLANG_TIDY_DOCS_REPO_PATH}/{content.name}",
            **ref_kwargs
        )

        for file in file_contents:
            download_file(file, content.name)

    update_rst_docs()


# Run the generator only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
