# parser.py

import json
import re
import sys
import urllib
import urllib.parse
from collections import defaultdict

import bs4
from django.utils.text import slugify

from shared.utils import strip_all_html_tags

from .constants import BENEFITS

# Slugified section names accepted by clean_point(); any other slug makes it
# raise ValueError.
KNOWN_KEYS = [
    "nadpis",
    "anotace",
    "problem",
    "kontext-problemu",
    "ideal",
    "navrhovana-opatreni",
    "casovy-horizont",
    "co-jsme-uz-udelali",
    "faq",
    "souvisejici-body",
    "zdroje",
]

# Benefit name that prepare_benefit_for_all() picks out and that
# prepare_main_benefits() / prepare_benefits() skip.
BENEFIT_FOR_ALL = "společnost jako celek"

# Maps slugify(old_name) -> num for every (num, _, old_name) entry of BENEFITS.
MAIN_BENEFITS = {slugify(old_name): num for num, _, old_name in BENEFITS}


def parse_program_html(fp):
    """Parse one program point exported from Google Docs as HTML.

    The document must contain exactly two tables. The first one holds the
    named sections of the point (one row per section, the first row is
    skipped); the second one holds the benefits per target group (again the
    first row is skipped).

    Unknown or missing section names are only reported on stderr; they do
    not abort parsing.

    Args:
        fp: Markup (string or open file) passed to ``bs4.BeautifulSoup``.

    Returns:
        A dict with three keys: ``"nazev"`` (stripped text of the first
        h1/h2/h3 element), ``"sekce"`` (section name -> cleaned HTML string)
        and ``"benefity"`` (target group -> cleaned HTML string; groups whose
        benefit text is blank are left out).

    Raises:
        AssertionError: If the stylesheet contains no class for bold text or
            no class for superscript.
        ValueError: If the document does not have exactly two tables, or a
            processed row does not have the expected number of cells.
    """
    # Load the whole document.
    html = bs4.BeautifulSoup(fp, "html5lib")

    # Drop comment references: the parent of every element whose id starts
    # with "cmnt" is removed.
    for cmnt in html.select('*[id^="cmnt"]'):
        cmnt.parent.extract()

    # The point has a working title.
    nazev_bodu = html.select_one("h1, h2, h3").text.strip()

    # The point has named sections.
    bod = {}

    SEKCE = [
        "Nadpis",
        "Anotace",
        "Problém",
        "Kontext problému",
        "Ideál",
        "Navrhovaná opatření",
        "Časový horizont",
        "FAQ",
        "Související body",
        "Zdroje",
        "Co jsme už udělali",
    ]

    # The benefits table has named target groups.
    benefity = {}

    # Only these attributes are kept in the output HTML.
    ATRIBUTY = {"id", "href"}

    # The document has exactly two tables: the point and its benefits.
    bod_html, bene_html = html.select("table")

    # Google encodes bold text and superscript as generated CSS classes, so
    # the class names have to be derived from the stylesheet.
    strong = []
    sup = []

    # Accumulate over every <style> element; assigning inside the loop would
    # keep only the matches of the last one.
    for style in html.select("style"):
        strong.extend(
            re.findall(r"(\.c[0-9]+)\{[^{]*font-weight:700", style.text)
        )
        sup.extend(
            re.findall(r"(\.c[0-9]+)\{[^{]*vertical-align:super", style.text)
        )

    # Raised explicitly (instead of ``assert``) so the check survives
    # ``python -O``; the exception type is unchanged for existing callers.
    if not strong:
        raise AssertionError("Nenašel jsem styl pro tučný text")
    if not sup:
        raise AssertionError("Nenašel jsem styl pro superskript")

    def vycisti(sekce):
        """Clean one table cell in place and turn it into a bare <div>."""
        sekce.name = "div"
        sekce.attrs.clear()

        # Replace the generated classes with native HTML elements.
        for tag in sekce.select(", ".join(sup)):
            tag.name = "sup"

        for tag in sekce.select(", ".join(strong)):
            tag.name = "strong"

        # Get rid of the remaining <span>s.
        for tag in sekce.select("span"):
            tag.unwrap()

        # Drop unwanted attributes; "id" is kept on <a> elements only.
        for tag in sekce.find_all():
            for attr in list(tag.attrs):
                if attr not in ATRIBUTY or (attr == "id" and tag.name != "a"):
                    del tag.attrs[attr]

        # Drop tags without any text (except <br> and <hr>).
        for tag in sekce.find_all():
            if tag.text == "" and tag.name not in ("br", "hr"):
                tag.extract()

        # Unwrap links that were routed through a Google redirect. Only
        # ".../url?q=<target>" on a Google host is treated as a redirect, so
        # ordinary links that merely carry a "q" parameter (e.g. a search
        # query) keep their original href.
        for tag in sekce.select("*[href]"):
            parts = urllib.parse.urlsplit(tag.attrs["href"])
            qs = urllib.parse.parse_qs(parts.query)
            host_labels = (parts.hostname or "").split(".")

            if "q" in qs and parts.path == "/url" and "google" in host_labels:
                tag.attrs["href"] = qs["q"][0]

        # Remove the Facebook click identifier from links.
        for tag in sekce.select("*[href]"):
            proto, loc, path, query, frag = urllib.parse.urlsplit(tag.attrs["href"])
            qs = urllib.parse.parse_qs(query)

            if "fbclid" in qs:
                del qs["fbclid"]
                query = urllib.parse.urlencode(qs, doseq=True)
                tag.attrs["href"] = urllib.parse.urlunsplit(
                    (proto, loc, path, query, frag)
                )

        # Merge directly adjacent elements of the same kind.
        for fst in sekce.select("ul, sup, strong"):
            # Already merged into a preceding sibling.
            if fst.parent is None:
                continue

            snd = fst.next_sibling

            while snd is not None and snd.name == fst.name:
                snd.extract()
                # Iterate over a snapshot: append() re-parents the child and
                # thereby removes it from snd.contents, so iterating snd
                # directly would skip every second child.
                for child in list(snd.contents):
                    fst.append(child)

                snd = fst.next_sibling

    # Process the point first.
    radky = list(bod_html.select("tr"))

    for radek in radky[1:]:
        nazev, sekce = radek.select("td")

        nazev = nazev.text
        nazev = nazev.strip(" \u00a0\t\r\n:")
        nazev = re.sub("[ \u00a0]+", " ", nazev)

        if nazev not in SEKCE:
            print("Přebývá neznámá sekce: {!r}".format(nazev), file=sys.stderr)

        vycisti(sekce)
        bod[nazev] = sekce

    for nazev in SEKCE:
        if nazev not in bod:
            print("Chybí povinná sekce {!r}".format(nazev), file=sys.stderr)

    # Benefits.
    for radek in bene_html.select("tr")[1:]:
        cilovka, benefit, _info = radek.select("td")

        cilovka = cilovka.text
        # Drop any parenthesised note from the target group name.
        cilovka = re.sub(r"\(.*\)", "", cilovka)
        cilovka = cilovka.strip(" \u00a0\t\r\n:")
        cilovka = re.sub("[ \u00a0]+", " ", cilovka)

        vycisti(benefit)

        if benefit.text.strip() != "":
            benefity[cilovka] = benefit

    # For a quick visual check when debugging:

    # print("<h1>" + nazev_bodu + "</h1>")

    # for nazev, sekce in bod.items():
    #     print("<hr/>")
    #     print("<h2>" + nazev + "</h2>")
    #     print(sekce)

    # print("<hr/>")
    # print("<h2>Benefity</h2>")
    # for cilovka, benefit in benefity.items():
    #     print("<h3>" + cilovka + "</h3>")
    #     print(benefit)

    return {
        "nazev": nazev_bodu,
        "sekce": {nazev: str(sekce) for nazev, sekce in bod.items()},
        "benefity": {cilovka: str(benefit) for cilovka, benefit in benefity.items()},
    }


def strip_div(value):
    """Return *value* with every literal ``<div>`` and ``</div>`` removed."""
    for marker in ("<div>", "</div>"):
        value = value.replace(marker, "")
    return value


def replace_tags(value):
    """Strip ``<div>`` wrappers and make sure the result starts with ``<p>``.

    After ``strip_div`` the value is returned unchanged when it already
    starts with ``<p>``; otherwise it is wrapped in a single ``<p>...</p>``.
    """
    stripped = strip_div(value)
    return stripped if stripped.startswith("<p>") else f"<p>{stripped}</p>"


def set_fancy_lists(value):
    """Add the styling classes to every bare ``<ul>`` and ``<li>`` tag.

    Only attribute-less opening tags (exactly ``<ul>`` / ``<li>``) are
    replaced.
    """
    substitutions = (
        ("<ul>", '<ul class="unordered-list unordered-list-checks">'),
        ("<li>", '<li class="mb-4">'),
    )
    for plain, fancy in substitutions:
        value = value.replace(plain, fancy)
    return value


def clean_point(point):
    """Normalise the sections of a parsed point.

    Every key is slugified and must be one of ``KNOWN_KEYS``. The ``nadpis``
    value has all HTML tags stripped; every other value goes through
    ``replace_tags``.

    Args:
        point: Mapping of section name -> HTML string.

    Returns:
        A new dict keyed by the slugified section names.

    Raises:
        ValueError: If a slugified key is not in ``KNOWN_KEYS``.
    """
    cleaned = {}

    for raw_name, html in point.items():
        slug = slugify(raw_name)

        if slug not in KNOWN_KEYS:
            raise ValueError(f"Unknown key: {raw_name}")

        # The heading is plain text; everything else stays HTML.
        converter = strip_all_html_tags if slug == "nadpis" else replace_tags
        cleaned[slug] = converter(html)

    return cleaned


def prepare_faq(value):
    """Convert FAQ HTML into a JSON list of question blocks.

    The top-level nodes of *value* are walked in order. A tag that contains
    a ``<strong>`` element starts a new question, whose text is the text of
    that ``<strong>``. Every following node is appended to the answer of the
    current question until the next question starts.

    Args:
        value: HTML fragment with the FAQ section.

    Returns:
        A JSON string with a list of
        ``{"type": "question", "value": {"question": ..., "answer": ...}}``
        objects. Questions that never receive an answer node are omitted, so
        an input without any question/answer pair yields ``"[]"``.

    Raises:
        ValueError: If non-blank content appears before the first question.
    """
    soup = bs4.BeautifulSoup(value, "html.parser")
    questions = defaultdict(list)
    key = None  # No question seen yet.

    for node in soup.children:
        # Top-level text nodes are not tags and have no ``.strong``.
        is_tag = isinstance(node, bs4.Tag)

        if is_tag and node.strong is not None:
            # get_text() always returns a string; ``.string`` would be None
            # when <strong> has more than one child.
            key = node.strong.get_text()
            continue

        text = node.get_text() if is_tag else str(node)
        has_text = bool(text.strip())

        if key is None:
            # Nothing to attach the node to. Blank nodes (e.g. an empty
            # section) are skipped; real content is reported.
            if has_text:
                raise ValueError(
                    f"FAQ content precedes the first question: {str(node)[:80]!r}"
                )
            continue

        # Tags are always part of the answer; stray whitespace between the
        # tags is not.
        if is_tag or has_text:
            questions[key].append(str(node))

    data = [
        {"type": "question", "value": {"question": question, "answer": "".join(parts)}}
        for question, parts in questions.items()
    ]

    return json.dumps(data)


def prepare_horizon(value):
    """Split a horizon value into a number and a word when it has that shape.

    The tag-stripped text is matched against "<digits><one whitespace
    character><word>".

    Returns:
        ``(None, digits, word)`` when the text matches, with both parts as
        strings; otherwise ``(value, None, None)`` with the original,
        unstripped *value*.
    """
    match = re.match(r"^(\d+)\s(\w+)$", strip_all_html_tags(value))
    if match is None:
        return value, None, None

    amount, unit = match.groups()
    return None, amount, unit


def print_preview(point):
    """Print an empty line, then each key with the first 120 characters of its value."""
    print()
    for name, text in point.items():
        print(f"{name} : {text[:120]}")


def print_full(point):
    """Print every key and its full value, framed by 100-dash rules."""
    rule = "-" * 100
    for name, text in point.items():
        # One call emitting: blank line, key, rule, value, rule.
        print("", name, rule, text, rule, sep="\n")


def prepare_point(source):
    """Return *source* cleaned by ``clean_point``.

    For debugging, the result can be dumped with ``print_full``.
    """
    return clean_point(source)


def prepare_benefit_for_all(benefits):
    """Return the ``BENEFIT_FOR_ALL`` entry without ``<div>`` tags.

    Returns ``None`` when *benefits* has no such entry.
    """
    if BENEFIT_FOR_ALL not in benefits:
        return None
    return strip_div(benefits[BENEFIT_FOR_ALL])


def prepare_main_benefits(benefits):
    """Serialise the benefits whose slugified name is in ``MAIN_BENEFITS``.

    The ``BENEFIT_FOR_ALL`` entry is skipped. Each remaining match becomes a
    ``{"type": "benefit", "value": {"variant": ..., "text": ...}}`` item,
    where ``variant`` is the ``MAIN_BENEFITS`` value for the slug and
    ``text`` has its ``<div>`` tags removed.

    Returns:
        A JSON string with the list of items, or ``None`` when there are none.
    """
    slugged = (
        (slugify(name), html)
        for name, html in benefits.items()
        if name != BENEFIT_FOR_ALL
    )
    blocks = [
        {
            "type": "benefit",
            "value": {"variant": MAIN_BENEFITS[slug], "text": strip_div(html)},
        }
        for slug, html in slugged
        if slug in MAIN_BENEFITS
    ]

    if not blocks:
        return None
    return json.dumps(blocks)


def prepare_benefits(benefits):
    """Serialise the benefits whose slugified name is NOT in ``MAIN_BENEFITS``.

    The ``BENEFIT_FOR_ALL`` entry is skipped. Each remaining entry becomes a
    ``{"type": "benefit", "value": {"title": ..., "text": ...}}`` item, where
    ``title`` is the original name and ``text`` has its ``<div>`` tags
    removed.

    Returns:
        A JSON string with the list of items, or ``None`` when there are none.
    """
    blocks = [
        {"type": "benefit", "value": {"title": title, "text": strip_div(html)}}
        for title, html in benefits.items()
        if title != BENEFIT_FOR_ALL and slugify(title) not in MAIN_BENEFITS
    ]

    if not blocks:
        return None
    return json.dumps(blocks)