"""Django management command that imports posts from a Jekyll site into Wagtail.

The command reads ``_config.yml`` and every ``_posts/YYYY-MM-DD-slug.md`` file
of a Jekyll repository, converts each post to a ``DistrictArticlePage`` and
registers a permanent redirect from the post's old Jekyll URL.

NOTE: this module was recovered from a change set that also adds ``.vscode/``
to ``.gitignore`` and creates the empty package files
``district/management/__init__.py`` and
``district/management/commands/__init__.py``.
"""
import datetime
import os
import re
import xml.etree.ElementTree as ET
from io import StringIO

import markdown
import markdown.serializers
import yaml
from django.core.files.images import ImageFile
from django.core.management.base import BaseCommand, CommandError
from django.db.models.expressions import Col
from django.utils.dateparse import (
    parse_date,
    parse_datetime,
    parse_duration,
    parse_time,
)
from django.utils.text import slugify
from markdown import Markdown
from markdown.extensions import Extension
from markdown.inlinepatterns import InlineProcessor
from wagtail.core.models import Site
from wagtail.core.models.collections import Collection
from wagtail.images.models import Image

from district.models import DistrictArticlePage, DistrictArticlesPage

# Wagtail needs this:
# https://docs.wagtail.io/en/stable/extending/rich_text_internals.html#data-format
# Registering "embed" as an empty element makes the serializer emit it
# self-closed.
markdown.serializers.HTML_EMPTY.add("embed")


# Plain-text output format used for the perex.
def unmark_element(element, stream=None):
    """Serialize an ElementTree *element* to plain text, dropping all markup.

    Args:
        element: Element whose text, children and tail are written out.
        stream: Text buffer shared by the recursive calls; a new ``StringIO``
            is created when omitted.

    Returns:
        The complete contents of *stream* after *element* has been written.
    """
    if stream is None:
        stream = StringIO()
    if element.text:
        stream.write(element.text)
    for sub in element:
        unmark_element(sub, stream)
    if element.tail:
        stream.write(element.tail)
    return stream.getvalue()


Markdown.output_formats["plain"] = unmark_element
plain_md = Markdown(output_format="plain")
plain_md.stripTopLevelTags = False

PATH = os.path.abspath("../cb.pirati.cz/")
# Root of the Jekyll repository used by ``ImgProcessor``.  ``Command.handle``
# overwrites it with the command-line argument; it starts out as ``PATH`` so
# the name is always defined.
path = PATH

POSTS_DIR = "_posts"
TITLE_SUFFIX = " - Piráti České Budějovice"

# Jekyll post file name: <year>-<month>-<day>-<slug>.<extension>
_POST_FILENAME_RE = re.compile(r"(\d*)-(\d*)-(\d*)-(.*)\.(.*)")


class ImgProcessor(InlineProcessor):
    """Inline processor turning ``![alt](src)`` into a Wagtail image embed."""

    def handleMatch(self, m, data):
        """Build an ``<embed>`` element for the image matched by *m*.

        The image file is looked up relative to the module-level ``path`` and
        is created in the "import" collection when it does not exist yet.

        Returns:
            ``(element, start, end)`` as required by ``InlineProcessor``.
        """
        el = ET.Element("embed")
        el.attrib["embedtype"] = "image"
        el.attrib["alt"] = m.group(1)
        el.attrib["format"] = "left"
        collection = get_collection()
        # TODO: pass the repository path explicitly instead of via a global.
        image_obj = get_or_create_image(path, m.group(2), collection=collection)
        el.attrib["id"] = str(image_obj.pk)
        return el, m.start(0), m.end(0)


class ImgExtension(Extension):
    """Markdown extension that registers ``ImgProcessor`` for inline images."""

    IMG_PATTERN = r"!\[(.*?)\]\((.*?)\)"

    def extendMarkdown(self, md):
        """Register the image processor on *md* under the name ``img``."""
        md.inlinePatterns.register(ImgProcessor(self.IMG_PATTERN, md), "img", 175)


html_md = Markdown(extensions=[ImgExtension()])


def get_perex(text):
    """Return the first paragraph of Markdown *text* rendered as plain text."""
    first_paragraph = re.split(r"^\s*$", text.strip(), flags=re.MULTILINE)[0]
    return plain_md.convert(first_paragraph)


def get_site_config(path):
    """Load ``_config.yml`` from the Jekyll repository at *path*.

    Returns:
        The parsed configuration, or an empty dict when the file is empty.
    """
    with open(os.path.join(path, "_config.yml"), encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def _post_date(raw):
    """Return the calendar date of a front-matter ``date`` value.

    YAML may already have parsed the value into a ``date``/``datetime``;
    otherwise the first whitespace-separated token of its string form is
    parsed as an ISO date.

    Raises:
        ValueError: if the value cannot be interpreted as a date.
    """
    # ``datetime`` is a subclass of ``date``, so it has to be tested first.
    if isinstance(raw, datetime.datetime):
        return raw.date()
    if isinstance(raw, datetime.date):
        return raw
    tokens = str(raw).split()
    parsed = parse_date(tokens[0]) if tokens else None
    if parsed is None:
        raise ValueError("Cannot parse post date: %r" % (raw,))
    return parsed


def import_post(path, file_path, parrent, title_suffix):
    """Import one Jekyll post as a child page of *parrent*.

    Args:
        path: Root directory of the Jekyll repository.
        file_path: Path of the post file relative to *path*.
        parrent: Page under which the new article is added.
        title_suffix: Text appended to the title to form the SEO title.

    Returns:
        The already existing article with the same title and date, or the
        newly created ``DistrictArticlePage``.

    Raises:
        ValueError: if the file has no front matter or an unparsable date.
        KeyError: if a required front-matter key is missing.
    """
    with open(os.path.join(path, file_path), "rt", encoding="utf-8") as f:
        # At most two splits: a later "---" line is a Markdown horizontal
        # rule that belongs to the body and must not truncate it.
        parts = re.split(r"^---\s*$", f.read(), maxsplit=2, flags=re.MULTILINE)
    if len(parts) < 3:
        raise ValueError("Missing YAML front matter in %s" % file_path)

    meta = yaml.safe_load(parts[1])
    md = parts[2]
    html = html_md.convert(md)
    post_date = _post_date(meta["date"])

    # A post counts as already imported when title and date both match.
    for existing in DistrictArticlePage.objects.filter(title=meta["title"]):
        if existing.date == post_date:
            return existing

    article = DistrictArticlePage()

    article.text = html
    article.perex = get_perex(md)
    article.date = post_date
    article.title = meta["title"]
    article.author = meta["author"]

    article.seo_title = article.title + title_suffix
    article.search_description = meta.get("description", "")

    collection = get_collection()
    article.image = get_or_create_image(path, meta["image"], collection=collection)

    parrent.add_child(instance=article)

    rev = article.save_revision()
    # A post without a "published" key is published, instead of failing
    # with KeyError.
    if meta.get("published", True):
        rev.publish()
    return article


def get_collection():
    """Return the collection named "import", creating it when missing."""
    existing = Collection.objects.filter(name="import").first()
    if existing is not None:
        return existing
    # Attach the new collection below the existing tree root rather than
    # creating a second root; fall back to a root node only when the tree is
    # still empty.
    root = Collection.get_first_root_node()
    if root is None:
        return Collection.add_root(name="import")
    return root.add_child(name="import")


def get_or_create_image(path, file_path, collection):
    """Return the image titled *file_path*, importing the file when needed.

    Args:
        path: Root directory of the Jekyll repository.
        file_path: Image path relative to *path*; leading slashes are dropped.
        collection: Collection that receives a newly created image.

    Returns:
        The existing or newly saved ``Image``.
    """
    file_path = file_path.lstrip("/")
    existing = Image.objects.filter(title=file_path).first()
    if existing is not None:
        return existing
    # Keep the source file open only while the image is being saved.
    with open(os.path.join(path, file_path), "rb") as fh:
        image = Image(
            title=file_path,
            file=ImageFile(fh, name=file_path),
            collection=collection,
        )
        image.save()
    return image


class Command(BaseCommand):
    """Import the articles of a Jekyll repository and redirect their old URLs."""

    help = """Importuje články z pirátského jekyll webu.

    """

    def add_arguments(self, parser):
        """Declare the repository path and the optional site/parent selectors."""
        parser.add_argument("path", help="Cesta k jekyll repu")
        parser.add_argument(
            "--hostname",
            default=None,
            help="Hostname webu, defaultně vezme první site (/admin/sites/)",
        )
        parser.add_argument(
            "--clanky-id",
            default=None,
            type=int,
            help="Id nadstránky článků, defaultně vezme první",
        )

    def handle(self, *args, **options):
        """Import every Markdown post and create a redirect for each one.

        Raises:
            CommandError: if no articles parent page exists or the site
                config lacks ``articlepath``.
        """
        global path
        # Imported here rather than at module level so the redirects app is
        # only required when the command actually runs.
        from wagtail.contrib.redirects.models import Redirect

        if options["clanky_id"] is None:
            articles = DistrictArticlesPage.objects.first()
            if articles is None:
                raise CommandError("No DistrictArticlesPage exists to import into.")
        else:
            articles = DistrictArticlesPage.objects.get(pk=options["clanky_id"])

        path = options["path"]
        site_config = get_site_config(path)

        if "title" in site_config:
            title_suffix = " - " + site_config["title"]
        else:
            title_suffix = ""

        if options["hostname"] is None:
            site = Site.objects.first()
        else:
            site = Site.objects.get(hostname=options["hostname"])

        try:
            articlepath = site_config["articlepath"]
        except KeyError:
            raise CommandError("Missing 'articlepath' in _config.yml") from None

        # Sorted so posts are imported in a deterministic (chronological) order.
        for fn in sorted(os.listdir(os.path.join(path, POSTS_DIR))):
            match = _POST_FILENAME_RE.match(fn)
            if not match:
                self.stdout.write(self.style.WARNING("Skipping: %s" % fn))
                continue

            y, m, d, slug, ext = match.groups()
            if ext != "md":
                self.stdout.write(self.style.ERROR("Not Implemented: %s" % ext))
                continue

            article = import_post(
                path, os.path.join(POSTS_DIR, fn), articles, title_suffix
            )
            # Store the path in the normalised form produced by
            # ``Redirect.normalise_path`` (presumably without the trailing
            # slash), on the assumption that lookups use the same form.
            old_path = Redirect.normalise_path(
                "/%s/%s/%s/%s/%s/" % (articlepath, y, m.zfill(2), d.zfill(2), slug)
            )
            Redirect.objects.get_or_create(
                site=site,
                old_path=old_path,
                defaults={"is_permanent": True, "redirect_page": article},
            )