import json
import pickle
from base64 import b64decode
from csv import reader
from datetime import UTC, datetime
from email.header import decode_header
from io import BytesIO
from logging import getLogger
from pathlib import PosixPath
from pprint import pformat
from re import match
from time import time
from typing import Any, BinaryIO
from urllib.parse import quote

import requests
from pdfminer.high_level import extract_text
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfparser import PDFParser, PDFSyntaxError
from pdfminer.psparser import PSEOF
from pypandoc import convert_text, get_pandoc_formats, normalize_format
from rt.rest2 import Rt, UnexpectedResponseError  # type: ignore[attr-defined]

from eego import Document

# Logger shared by every injector defined in this module.
log = getLogger("eego")

# First element of the value returned by pypandoc's get_pandoc_formats();
# Injector.pantextualize() refuses any format that is not listed here.
# NOTE: evaluated at import time.
pandoc_input_formats = get_pandoc_formats()[0]

# Exceptions caught around pdfminer's extract_text(); when one of them is
# raised the error is logged and the PDF is skipped instead of aborting the run.
pdfminer_ignored_errors = (ValueError, PSEOF, PDFSyntaxError, AssertionError)


class EegoBadResponseError(Exception):
    """Raised when the eego API answers with an unexpected HTTP status code."""


class Injector:
    """Base class for injectors that push documents to the eego HTTP API.

    It stores the API settings taken from the configuration, looks up when the
    given source was last updated, and offers helpers to convert content to
    plain text, read PDF titles and send/commit documents.
    """

    # Timeout, in seconds, passed to every `requests` call made to the eego API.
    TIMEOUT = 900

    def __init__(
        self,
        config: dict[str, Any],
        source: str = "unknown",
        base_path: PosixPath = PosixPath("/dev/null/"),
        base_uri: str = "https://invalid/",
    ) -> None:
        """Store the settings and fetch the last update time of `source`.

        Args:
            config: Application configuration. Reads ``config["test"]``,
                ``config["injectors"]["eego-api-token"]`` and
                ``config["injectors"]["eego-api-base-uri"]``.
            source: Name of the document source handled by this injector.
            base_path: Root directory of the source (used by subclasses).
            base_uri: Prefix of the public URIs built for the documents.

        When ``config["test"]`` is truthy no request is made and
        ``last_update`` is ``0.0``; otherwise the body of
        ``GET /doc?sort=modtime-cache-descending&source=<source>`` is parsed
        as a float and later used as a Unix timestamp.
        """
        self.config = config
        self.source = source
        self.base_path = base_path
        self.base_uri = base_uri
        self.headers = {"Authorization": self.config["injectors"]["eego-api-token"]}
        # NOTE(review): `.get()` yields None when the key is missing, which
        # makes every URL concatenation below fail with a TypeError.
        self.eego_api_base_uri = self.config["injectors"].get("eego-api-base-uri")

        if self.config["test"]:
            self.last_update = 0.0
        else:
            # NOTE(review): the status code is not checked; a body that is not
            # a number makes float() raise ValueError.
            response = requests.get(
                self.eego_api_base_uri
                + f"/doc?sort=modtime-cache-descending&source={self.source}",
                headers=self.headers,
                timeout=self.TIMEOUT,
            )
            self.last_update = float(response.text)

        log.info(
            "Source %s was last updated on %s",
            self.source,
            datetime.fromtimestamp(self.last_update, tz=UTC).strftime(
                "%Y-%m-%d %H:%M:%S"
            ),
        )

    def pantextualize(
        self,
        content: str | bytes,
        extension: str,  # https://pandoc.org/MANUAL.html#option--from
    ) -> str:
        """Convert `content` to plain text with pandoc.

        Args:
            content: Document content to convert.
            extension: File extension or pandoc format name of `content`.

        Returns:
            The plain-text rendering produced by pandoc (run with its sandbox
            option enabled).

        Raises:
            TypeError: If the normalized format is not a pandoc input format.
        """
        fmt = normalize_format(extension)
        if fmt not in pandoc_input_formats:
            raise TypeError(f"Unsupported pandoc input format: {extension!r}")
        return str(
            convert_text(
                content,
                "plain",
                format=fmt,
                sandbox=True,
            )
        )

    @staticmethod
    def _decode_pdf_string(raw: bytes) -> str:
        """Decode a PDF metadata byte string, selecting the codec from its BOM."""
        # UTF-32 BOMs are tested first: the UTF-32-LE BOM starts with the same
        # two bytes as the UTF-16-LE BOM.
        if raw.startswith((b"\xff\xfe\x00\x00", b"\x00\x00\xfe\xff")):
            return raw.decode("utf_32", errors="replace")
        if raw.startswith((b"\xfe\xff", b"\xff\xfe")):
            return raw.decode("utf_16", errors="replace")
        if raw.startswith(b"\xef\xbb\xbf"):
            return raw.decode("utf_8_sig", errors="replace")
        # Without a BOM, UTF-16/UTF-32 must not be guessed: almost any
        # even-length byte string "decodes" as UTF-16 and yields garbage.
        try:
            return raw.decode("utf_8")
        except UnicodeDecodeError:
            # latin_1 maps every byte to a code point, so this cannot fail.
            return raw.decode("latin_1")

    def get_pdf_title(self, stream: BinaryIO) -> str | None:
        """Return the title stored in the PDF metadata of `stream`.

        Args:
            stream: Binary stream positioned on a PDF document.

        Returns:
            The decoded ``Title`` entry of the first info dictionary, or None
            when there is no info dictionary, no ``Title`` entry, or the entry
            is not a byte string.
        """
        doc = PDFDocument(PDFParser(stream))

        if not doc.info or "Title" not in doc.info[0]:
            return None
        raw_title = doc.info[0]["Title"]
        if not isinstance(raw_title, bytes):
            return None
        return self._decode_pdf_string(raw_title)

    def send_to_api(self, doc: Document) -> None:
        """POST `doc` as JSON to the ``/doc`` endpoint of the eego API.

        Raises:
            EegoBadResponseError: If the API does not answer with status 200.
        """
        time_start = time()
        response = requests.post(
            self.eego_api_base_uri + "/doc",
            headers=self.headers | {"Content-Type": "application/json"},
            data=json.dumps(doc.__dict__),
            timeout=self.TIMEOUT,
        )
        duration = time() - time_start
        log.debug("%s (%d)", response.status_code, duration)
        if response.status_code != 200:
            raise EegoBadResponseError(
                f"POST /doc answered with status {response.status_code}"
            )

    def commit(self) -> None:
        """Send a PATCH request to the ``/doc`` endpoint of the eego API.

        NOTE(review): the response status is not checked here, unlike in
        send_to_api(); confirm the expected success code before enforcing it.
        """
        requests.patch(
            self.eego_api_base_uri + "/doc",
            headers=self.headers,
            timeout=self.TIMEOUT,
        )


class DirectoryInjector(Injector):
    """Injector that indexes every file found below a directory."""

    def __init__(
        self,
        config: dict[str, Any],
        source: str = "unknown",
        base_path: PosixPath = PosixPath("/dev/null/"),
        base_uri: str = "https://invalid/",
    ) -> None:
        """Forward all arguments unchanged to `Injector.__init__`."""
        super().__init__(
            config=config,
            source=source,
            base_path=base_path,
            base_uri=base_uri,
        )

    def get_uri(self, filepath: PosixPath) -> str:
        """Build the public URI of `filepath`.

        The leading `base_path` and a trailing ``.txt`` extension are removed
        from the path, and the result is appended to `base_uri`.
        """
        # removeprefix/removesuffix only touch the ends of the path, so a
        # ".txt" (or the base path text) occurring in the middle is preserved.
        relative = str(filepath).removeprefix(str(self.base_path))
        return self.base_uri + relative.removesuffix(".txt")

    def file_reader(self, filepath: PosixPath) -> None | Document:
        """Extract the text and title of `filepath`.

        PDF files go through pdfminer; every other file is converted by pandoc
        using its extension as the input format.

        Returns:
            A Document describing the file, or None when the text could not be
            extracted (the reason is logged).
        """
        if filepath.suffix == ".pdf":
            try:
                content = extract_text(filepath)
            except pdfminer_ignored_errors:
                log.exception("Error extracting PDF text.")
                return None
            with filepath.open("rb") as file:
                title = self.get_pdf_title(file) or "PDF : " + filepath.name
        else:
            extension = filepath.suffix[1:]
            try:
                content = self.pantextualize(filepath.read_bytes(), extension)
            except TypeError:
                log.info("Skipping %s: format unsupported by pandoc.", filepath)
                return None
            except RuntimeError:
                # pypandoc reports a failed conversion with RuntimeError; skip
                # the file instead of aborting the whole directory scan.
                log.warning("Skipping %s: pandoc conversion failed", filepath)
                return None
            title = "Document " + extension.upper() + " : " + filepath.name

        # One stat() call so that modtime and size describe the same state.
        stats = filepath.stat()
        return Document(
            url=self.get_uri(filepath),
            source=self.source,
            content=content,
            title=title,
            modtime=stats.st_mtime,
            size=stats.st_size,
        )

    def dir_reader(self) -> None:
        """Send every file modified since the last update, then commit.

        Entries below `base_path` are visited from oldest to newest
        modification time. Files whose modification time is at least one
        second older than `last_update` are skipped.
        """
        for filepath in sorted(
            self.base_path.rglob("*"),
            key=lambda file: file.stat().st_mtime,
        ):
            if not filepath.is_file():
                continue

            if filepath.stat().st_mtime_ns / (10**9) <= self.last_update - 1:
                log.info("Skipping %s: already indexed", filepath)
                continue

            log.info("Reading %s", filepath)

            doc = self.file_reader(filepath)
            if doc is not None:
                self.send_to_api(doc)
        self.commit()


class DokuwikiPagesInjector(DirectoryInjector):
    """Injector for the page files of a DokuWiki installation.

    Instantiating the class immediately removes the deleted pages from the
    index and then indexes the pages found below the configured base path.
    """

    def __init__(self, config: dict[str, Any]) -> None:
        """Configure the injector from ``config["injectors"]["dokuwiki-pages"]``
        (keys ``base-path`` and ``base-uri``) and run the whole injection."""
        super().__init__(
            config=config,
            source="dokuwiki_pages",
            base_path=PosixPath(config["injectors"]["dokuwiki-pages"]["base-path"]),
            base_uri=config["injectors"]["dokuwiki-pages"]["base-uri"],
        )

        self.delete()
        self.dir_reader()

    def delete(self) -> None:
        """Delete from the API every page reported by get_deleted_uris().

        Raises:
            EegoBadResponseError: If a deletion answers with a status other
                than 200 or 404.
        """
        for uri in self.get_deleted_uris():
            response = requests.delete(
                self.eego_api_base_uri + "/doc/" + quote(uri, safe=""),
                headers=self.headers,
                timeout=self.TIMEOUT,
            )
            log.info("Deleting %s: %d", uri, response.status_code)
            if response.status_code not in [200, 404]:
                raise EegoBadResponseError
        self.commit()

    def get_deleted_uris(self) -> list[str]:
        """Return the URIs of the pages deleted since the last update.

        Reads the tab-separated ``../meta/_dokuwiki.changes`` file relative to
        `base_path`. A row is a deletion when its third column is ``D``; its
        first column is parsed as an integer timestamp and its fourth column
        is the page id, whose ``:`` separators are turned into ``/``.
        """
        deleted = []
        changelog = self.base_path / "../meta/_dokuwiki.changes"
        with changelog.open(encoding="utf-8", newline="") as file:
            for row in reader(file, delimiter="\t"):
                # row[3] is read below, so at least four columns are required.
                if len(row) <= 3 or row[2] != "D":
                    continue
                if int(row[0]) <= self.last_update - 1:
                    continue
                # NOTE(review): the host is hard-coded here while get_uri()
                # relies on self.base_uri; confirm both produce matching URIs.
                deleted.append(
                    "https://www.easter-eggs.fr/" + row[3].replace(":", "/"),
                )
        return deleted

    def file_reader(self, filepath: PosixPath) -> None | Document:
        """Convert a DokuWiki page to a Document.

        The page is read as UTF-8 and converted with pandoc's ``dokuwiki``
        format. Its title is the level-one heading (``====== Title ======``)
        that must start the file.

        Returns:
            The Document, or None when the file is not valid UTF-8, the pandoc
            conversion fails or no title is found (the reason is logged).
        """
        with filepath.open(encoding="utf-8") as file:
            try:
                raw = str(file.read())
                content = self.pantextualize(raw, "dokuwiki")
            except RuntimeError:
                log.warning("Skipping %s: pandoc conversion failed", filepath)
                return None
            except UnicodeDecodeError:
                log.error(
                    "Failed to read file '%s'. Looks like it is not utf-8 encoded",
                    filepath,
                )
                log.debug(
                    "Failed to read file '%s'. Full trace below",
                    filepath,
                    exc_info=True,
                )
                return None

        # match() anchors at the start of the text: the heading must come first.
        results = match("====== (?P<title>.+) ======", raw)
        if results is None:
            log.warning("Skipping %s: no title", filepath)
            return None

        # One stat() call so that modtime and size describe the same state.
        stats = filepath.stat()
        return Document(
            url=self.get_uri(filepath),
            source=self.source,
            content=content,
            title=results["title"],
            modtime=stats.st_mtime,
            size=stats.st_size,
        )


class UnexpectedRtResponsesInARowError(Exception):
    """Raised when consecutive RT transaction requests get an unexpected response."""


class RtInjector(Injector):
    """Injector that indexes the messages and PDF attachments of RT tickets.

    Instantiating the class immediately runs the whole injection.
    """

    # Base URL used to query the RT REST 2.0 API.
    RT_API = "https://rt.easter-eggs.com/REST/2.0/"
    # Base URL stripped from the attachment links found in RT responses.
    RT_API_INTERNAL = "https://rt.easter-eggs.fr/REST/2.0/"

    def __init__(self, config: dict[str, Any]) -> None:
        """Open the RT connection with ``config["injectors"]["rt"]["token"]``
        and start the injection."""
        super().__init__(
            config=config,
            source="rt",
            base_uri="https://rt.easter-eggs.fr/",
        )
        self.conn = Rt(
            url=self.RT_API,
            token=self.config["injectors"]["rt"]["token"],
            http_timeout=300,
        )
        # Number of consecutive transaction requests that failed.
        self.rt_errors_in_a_row = 0

        self.inject()

    def tickets_to_inject(self) -> list[dict[Any, Any]]:
        """Return the tickets updated since five seconds before `last_update`,
        ordered by ``LastUpdated``."""
        since = datetime.fromtimestamp(self.last_update - 5, tz=UTC).strftime(
            "%Y-%m-%d %H:%M:%S GMT"
        )
        return list(
            self.conn.search(
                order="LastUpdated",
                raw_query=f"Updated > '{since}'",
            )
        )

    def inject(self) -> None:
        """Index every ticket to inject, then commit.

        When the last update is more than five days old, the ticket list is
        stored in and read back from ``global_search.pickle`` in the current
        working directory, so an existing cache file is reused as is.
        """
        if time() - self.last_update > 86400 * 5:
            cache_path = PosixPath("global_search.pickle")
            if not cache_path.is_file():
                with cache_path.open("wb") as file:
                    pickle.dump(self.tickets_to_inject(), file)
                log.debug("Created cache file '%s'", cache_path)
            # NOTE(review): pickle.load() can execute arbitrary code; this is
            # only safe while the cache file cannot be written by someone else.
            # The file is also never refreshed or removed once it exists.
            with cache_path.open("rb") as file:
                tickets = pickle.load(file)
            log.debug("Loaded cache file '%s'", cache_path)
        else:
            tickets = self.tickets_to_inject()

        log.info("Found %d to index...", len(tickets))
        for ticket in tickets:
            self.inject_ticket(ticket)
        self.commit()

    def inject_ticket(  # pylint: disable=too-many-branches
        self, ticket: dict[str, Any]
    ) -> None:
        """Index the attachments of the relevant history events of `ticket`.

        Only ``Create``, ``Correspond`` and ``Comment`` events are considered.
        Tickets whose ``LastUpdated`` is at least one second older than
        `last_update` are skipped.

        Raises:
            UnexpectedRtResponsesInARowError: If two transaction requests in a
                row fail with UnexpectedResponseError.
        """
        ticket_modtime: float = datetime.fromisoformat(
            ticket["LastUpdated"]
        ).timestamp()
        if ticket_modtime <= self.last_update - 1:
            log.info("Skipping ticket already indexed: %s", ticket["id"])
            return
        log.debug(
            "Indexing ticket %s, last updated on %s...",
            ticket["id"],
            ticket["LastUpdated"],
        )
        for event in self.conn.get_ticket_history(ticket["id"]) or []:
            log.debug("Event %s", event["id"])
            if event["Type"] not in ("Create", "Correspond", "Comment"):
                continue
            for event_link in event["_hyperlinks"]:
                if event_link.get("id") is None:
                    continue
                log.debug("Link %s", event_link["id"])
                try:
                    transaction = self.conn.get_transaction(event_link["id"])
                except UnexpectedResponseError as err:
                    log.exception("Failed to handle transaction %s", event_link["id"])
                    # A single failure is tolerated; a second one in a row
                    # aborts the injection.
                    self.rt_errors_in_a_row += 1
                    if self.rt_errors_in_a_row > 1:
                        raise UnexpectedRtResponsesInARowError from err
                    continue
                self.rt_errors_in_a_row = 0
                for transaction_link in transaction["_hyperlinks"]:
                    log.debug(pformat(transaction_link))
                    if transaction_link["ref"] != "attachment":
                        continue
                    attachment_id = transaction_link["_url"].replace(
                        self.RT_API_INTERNAL + "attachment/", ""
                    )
                    self._inject_attachment(
                        attachment_id, ticket, event, ticket_modtime
                    )
        log.info("Finished indexing ticket %s", ticket["id"])

    def _inject_attachment(
        self,
        attachment_id: str,
        ticket: dict[str, Any],
        event: dict[str, Any],
        ticket_modtime: float,
    ) -> None:
        """Fetch one attachment and send it to the API when it is indexable."""
        log.debug("Attachment %s", attachment_id)
        message = self.conn.get_attachment(attachment_id)
        if message is None or message["Content"] is None:
            return

        content_type = message["ContentType"]
        if content_type not in ("text/plain", "application/pdf"):
            # Images, audio and HTML parts are dropped silently; any other
            # type is reported before being dropped.
            silently_ignored = (
                content_type.startswith(("image/", "audio/"))
                or content_type == "text/html"
            )
            if not silently_ignored:
                log.warning(
                    "Ignoring attachment %s from ticket %s, unknown type: '%s'",
                    attachment_id,
                    ticket["id"],
                    message["ContentType"],
                )
            return

        doc = self.message_reader(message, ticket_modtime)
        if doc is None:
            return
        if doc.title:
            log.debug('Sending doc with title "%s"', doc.title)
        else:
            log.debug("Sending doc with type '%s'", event["Type"])
        self.send_to_api(doc)

    @staticmethod
    def _decode_subject(subject: str) -> str:
        """Decode every RFC 2047 chunk of `subject` with its own charset."""
        parts = []
        for chunk, charset in decode_header(subject):
            if isinstance(chunk, str):
                parts.append(chunk)
                continue
            try:
                parts.append(chunk.decode(charset or "utf-8", errors="replace"))
            except LookupError:
                # Charset name unknown to Python: fall back to UTF-8.
                parts.append(chunk.decode("utf-8", errors="replace"))
        return "".join(parts)

    def message_reader(
        self, message: dict[str, Any], ticket_modtime: float
    ) -> Document | None:
        """Build a Document from an RT attachment.

        Args:
            message: Attachment as returned by RT; ``Content`` is
                base64-encoded and ``ContentType`` selects the PDF or the
                plain-text handling.
            ticket_modtime: Last update time of the owning ticket, stored as
                ``modtime_cache`` of the Document.

        Returns:
            The Document, or None when the PDF text cannot be extracted or the
            text content is not valid UTF-8 (the error is logged).
        """
        raw_content = b64decode(message["Content"])
        if message["ContentType"] == "application/pdf":
            pdf = BytesIO(raw_content)
            try:
                content = extract_text(pdf)
            except pdfminer_ignored_errors:
                log.exception("Error extracting PDF text.")
                return None
            url = (
                self.base_uri
                + "Ticket/Attachment/"
                + str(message["TransactionId"]["id"])
                + "/"
                + str(message["id"])
                + "/"
                + str(quote(message["Filename"], safe=""))
            )
            title = self.get_pdf_title(pdf) or "PDF : " + message["Filename"]
        else:
            try:
                content = raw_content.decode("utf-8")
            except UnicodeDecodeError:
                # Skip this attachment instead of aborting the whole run.
                log.exception(
                    "Error decoding text attachment %s as UTF-8.",
                    message.get("id"),
                )
                return None
            url = (
                self.base_uri
                + "Transaction/Display.html?id="
                + str(message["TransactionId"]["id"])
            )
            # Every encoded word is decoded with its declared charset; using
            # only the first chunk would truncate subjects such as
            # "Re: =?utf-8?q?...?=".
            title = self._decode_subject(message["Subject"] or "")

        return Document(
            url=url,
            source=self.source,
            content=content,
            title=title,
            modtime=datetime.fromisoformat(message["Created"]).timestamp(),
            size=len(raw_content),
            modtime_cache=ticket_modtime,
        )
