district import jekyll

ea202873 · jan.hosek · c55468c4 · ea202873 · ea202873 · ea202873
Commit ea202873 authored 3 years ago by jan.hosek
--- a/.gitignore
+++ b/.gitignore
@@ -151,3 +151,5 @@ static_files/
 update_election_statics.sh
 download_static.sh
 matice.csv
+.vscode/
--- a/district/management/__init__.py
+++ b/district/management/__init__.py
--- a/district/management/commands/__init__.py
+++ b/district/management/commands/__init__.py
--- a/district/management/commands/district_import_jekyll.py
+++ b/district/management/commands/district_import_jekyll.py
+import os
+import yaml, markdown, re
+from django.core.management.base import BaseCommand
+from django.db.models.expressions import Col
+from django.utils.text import slugify
+from django.core.files.images import ImageFile
+from wagtail.core.models.collections import Collection
+from wagtail.images.models import Image
+from wagtail.core.models import Site
+from district.models import DistrictArticlePage, DistrictArticlesPage
+from markdown import Markdown
+from markdown.inlinepatterns import InlineProcessor
+from markdown.extensions import Extension
+import xml.etree.ElementTree as ET
+from io import StringIO
+from django.utils.dateparse import (
+    parse_date,
+    parse_datetime,
+    parse_duration,
+    parse_time,
+)
+import markdown.serializers
+# Wagtail requires this, see https://docs.wagtail.io/en/stable/extending/rich_text_internals.html#data-format
+markdown.serializers.HTML_EMPTY.add("embed")  # add "embed" to the serializer's HTML_EMPTY set
+# Plain format pro perex
+def unmark_element(element, stream=None):
+    if stream is None:
+        stream = StringIO()
+    if element.text:
+        stream.write(element.text)
+    for sub in element:
+        unmark_element(sub, stream)
+    if element.tail:
+        stream.write(element.tail)
+    return stream.getvalue()
+Markdown.output_formats["plain"] = unmark_element  # make "plain" selectable as a Markdown output_format
+plain_md = Markdown(output_format="plain")  # shared converter used by get_perex
+plain_md.stripTopLevelTags = False
+global path  # NOTE(review): a module-level `global` statement has no effect; `path` is only assigned in Command.handle
+PATH = os.path.abspath("../cb.pirati.cz/")  # NOTE(review): not referenced anywhere in the visible code
+class ImgProcessor(InlineProcessor):
+    def handleMatch(self, m, data):
+        el = ET.Element("embed")
+        el.attrib["embedtype"] = "image"
+        el.attrib["alt"] = m.group(1)
+        el.attrib["format"] = "left"
+        collection = get_collection()
+        image_obj = get_or_create_image(
+            path, m.group(2), collection=collection
+        )  # TODO path
+        el.attrib["id"] = str(image_obj.pk)
+        return el, m.start(0), m.end(0)
+class ImgExtension(Extension):
+    def extendMarkdown(self, md):
+        IMG_PATTERN = r"!\[(.*?)\]\((.*?)\)"
+        md.inlinePatterns.register(ImgProcessor(IMG_PATTERN, md), "img", 175)
+html_md = Markdown(extensions=[ImgExtension()])  # shared HTML converter used by import_post
+def get_perex(text):
+    text = re.split(r"^\s*$", text.strip(), flags=re.MULTILINE)[0]
+    return plain_md.convert(text)
+POSTS_DIR = "_posts"  # posts directory, joined onto the repository path in Command.handle
+TITLE_SUFFIX = " - Piráti České Budějovice"  # NOTE(review): not referenced in the visible code; handle() derives its suffix from _config.yml
+def get_site_config(path):
+    with open(os.path.join(path, "_config.yml")) as f:
+        config = yaml.safe_load(f.read())
+    return config
+def import_post(path, file_path, parrent, title_suffix):
+    with open(os.path.join(path, file_path), "rt") as f:
+        r = re.split(r"^---\s*$", f.read(), maxsplit=3, flags=re.MULTILINE)
+    meta = yaml.safe_load(r[1])
+    md = r[2]
+    html = html_md.convert(md)
+    if DistrictArticlePage.objects.filter(title=meta["title"]).exists():
+        for article in DistrictArticlePage.objects.filter(title=meta["title"]):
+            if article.date == parse_date(meta["date"].split()[0]):
+                return article
+    article = DistrictArticlePage()
+    article.text = html
+    article.perex = get_perex(md)
+    article.date = meta["date"].split()[0]
+    article.title = meta["title"]
+    article.author = meta["author"]
+    article.seo_title = article.title + title_suffix
+    article.search_description = meta.get("description", "")
+    # for tag in meta['tags']:
+    #     article.tags.add(tag)
+    collection = get_collection()
+    article.image = get_or_create_image(path, meta["image"], collection=collection)
+    parrent.add_child(instance=article)
+    rev = article.save_revision()
+    if meta["published"]:
+        rev.publish()
+    return article
+def get_collection():
+    if Collection.objects.filter(name="import").exists():
+        collection = Collection.objects.filter(name="import").first()
+    else:
+        collection = Collection.add_root(name="import")
+    return collection
+def get_or_create_image(path, file_path, collection):
+    file_path = file_path.lstrip("/")
+    if Image.objects.filter(title=file_path).exists():
+        return Image.objects.filter(title=file_path).first()
+    else:
+        file = ImageFile(open(os.path.join(path, file_path), "rb"), name=file_path)
+        image = Image(title=file_path, file=file, collection=collection)
+        image.save()
+        return image
+class Command(BaseCommand):
+    help = """Importuje články z pirátského jekyll webu.
+    """
+    def add_arguments(self, parser):
+        parser.add_argument("path", help="Cesta k jekyll repu")
+        parser.add_argument(
+            "--hostname",
+            default=None,
+            help="Hostname webu, defaultně vezme první site (/admin/sites/)",
+        )
+        parser.add_argument(
+            "--clanky-id",
+            default=None,
+            type=int,
+            help="Id nadstránky článků, defaultně vezme první",
+        )
+    def handle(self, *args, **options):
+        if options["clanky_id"] is None:
+            articles = DistrictArticlesPage.objects.first()
+        else:
+            articles = DistrictArticlesPage.objects.get(pk=options["clanky_id"])
+        global path
+        path = options["path"]
+        site_config = get_site_config(path)
+        if "title" in site_config:
+            title_suffix = " - " + site_config.get("title", "")
+        else:
+            title_suffix = ""
+        if options["hostname"] is None:
+            site = Site.objects.first()
+        else:
+            site = Site.objects.get(hostname=options["hostname"])
+        articlepath = site_config["articlepath"]
+        for fn in os.listdir(os.path.join(path, POSTS_DIR)):
+            fname = os.path.join(POSTS_DIR, fn)
+            match = re.match(r"(\d*)-(\d*)-(\d*)-(.*)\.(.*)", fn)
+            if match:
+                y = match.group(1)
+                m = match.group(2)
+                d = match.group(3)
+                slug = match.group(4)
+                ext = match.group(5)
+                if ext == "md":
+                    article = import_post(path, fname, articles, title_suffix)
+                    from wagtail.contrib.redirects.models import Redirect
+                    r, created = Redirect.objects.get_or_create(
+                        site=site,
+                        old_path="/%s/%s/%s/%s/%s/"
+                        % (articlepath, y, m.zfill(2), d.zfill(2), slug),
+                        defaults={"is_permanent": True, "redirect_page": article},
+                    )
+                else:
+                    self.stdout.write(self.style.ERROR("Not Implemented: %s" % ext))
+            else:
+                self.stdout.write(self.style.WARNING("Skipping: %s" % fn))