Skip to content
Snippets Groups Projects
Commit e8b18cf3 authored by Tomáš Valenta's avatar Tomáš Valenta Committed by jan.bednarik
Browse files

switch instagram API to scraping

parent 63cc1875
Branches
No related tags found
2 merge requests!804Release,!790Fix personal calendar, homepage search & scrape Instagram instead of using API
Pipeline #13671 passed
......@@ -93,9 +93,7 @@ class CalendarMixin(models.Model):
calendar_format_events = []
for event in (
self.calendar.past_events
if self.calendar.past_events is not None
else []
self.calendar.past_events if self.calendar.past_events is not None else []
) + (
self.calendar.future_events
if self.calendar.future_events is not None
......
......@@ -4,11 +4,9 @@ from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('district', '0110_remove_districtpersonpage_ical_calendar_url_and_more'),
('district', '0113_merge_20230502_1854'),
("district", "0110_remove_districtpersonpage_ical_calendar_url_and_more"),
("district", "0113_merge_20230502_1854"),
]
operations = [
]
operations = []
import datetime
import mimetypes
import uuid
from django.db import models
......@@ -7,6 +9,27 @@ def get_current_datetime() -> datetime.datetime:
return datetime.datetime.now(tz=datetime.timezone.utc)
def get_instagram_image_path(instance, filename) -> str:
    """
    Build the storage path for a downloaded Instagram image.

    The original file name is not reused: the result is a freshly
    generated UUID inside the ``instagram/`` directory, followed by an
    extension derived from the MIME type guessed from ``filename``.

    Args:
        instance: Model instance the file belongs to. Unused; it is part
            of the signature because this function is passed as the
            ``upload_to`` callable of an ``ImageField``.
        filename: Name of the incoming file, used only to guess its type.

    Returns:
        ``instagram/<uuid4><extension>``. The extension is an empty string
        when no MIME type (or no extension for it) could be determined.
    """

    mimetypes_instance = mimetypes.MimeTypes()
    guessed_type = mimetypes_instance.guess_type(filename, strict=False)[0]

    extension = ""
    if guessed_type is not None:
        # guess_extension() performs the reverse (type -> extension) lookup
        # and returns None when the type has no known extension.
        extension = (
            mimetypes_instance.guess_extension(guessed_type, strict=False) or ""
        )

    return f"instagram/{uuid.uuid4()}{extension}"
class InstagramPost(models.Model):
"""
Model representing an Instagram post obtained from its API through the
......@@ -38,7 +61,7 @@ class InstagramPost(models.Model):
)
image = models.ImageField(
verbose_name="Obrázek",
upload_to="instagram",
upload_to=get_instagram_image_path,
)
url = models.URLField(
verbose_name="Odkaz",
......
......@@ -3,6 +3,7 @@ import io
import logging
import os
import instaloader
import requests
from django.core.files import File
......@@ -22,24 +23,21 @@ class InstagramDownloadService:
self.app_id = app_id
self.app_secret = app_secret
def get_user_info_list(self) -> list[str]:
def get_usernames(self) -> list[str]:
access_block = MainHomePage.objects.first().instagram_access
homepage_access_list = [
(block["value"]["name"], block["value"]["access_token"])
for block in access_block.raw_data
]
username_list = [block["value"]["username"] for block in access_block.raw_data]
people_access_list = []
for person_page in MainPersonPage.objects.all():
if (
person_page.instagram_username is None
or person_page.instagram_username in username_list
):
continue
for people_page in MainPersonPage.objects.all():
people_access_list += [
(block["value"]["name"], block["value"]["access_token"])
for block in people_page.instagram_access.raw_data
]
username_list.append(person_page.instagram_username)
# Remove duplicates
return list({*people_access_list, *homepage_access_list})
return username_list
def download_remote_image(self, image_url) -> (str, File):
try:
......@@ -51,81 +49,43 @@ class InstagramDownloadService:
return os.path.basename(image_url), File(io.BytesIO(response.content))
def get_user_data(self, access_token: str) -> dict:
user_data = requests.get(
f"https://graph.instagram.com/v16.0/me?access_token={access_token}"
"&fields=id,username"
)
user_data.raise_for_status()
return user_data.json()
def get_recent_media(self, user_data: dict, access_token: str) -> list[dict]:
recent_media = requests.get(
f"https://graph.instagram.com/v16.0/{user_data['id']}/media?access_token="
f"{access_token}&fields=id,timestamp,caption,media_type,permalink,"
"media_url,thumbnail_url"
)
if not recent_media.ok:
logger.warning(
"Error getting media for user %s: %s",
user_data["id"],
recent_media.status_code,
)
def parse_media_for_user(self, username: str) -> None:
loader = instaloader.Instaloader()
return []
profile = instaloader.Profile.from_username(loader.context, username)
logger.debug("Parsing Instagram feed: %s", recent_media)
for remote_post in profile.get_posts():
# Don't recreate existing posts
return recent_media.json()["data"]
def parse_media_for_user(self, name: str, access_token: str) -> None:
user_data = self.get_user_data(access_token)
recent_media_json = self.get_recent_media(user_data, access_token)
if len(recent_media_json) == 0:
return
posts = []
for media_data in recent_media_json:
# Don't recreate existing posts'
if InstagramPost.objects.filter(remote_id=media_data["id"]).exists():
if InstagramPost.objects.filter(remote_id=remote_post.shortcode).exists():
logging.info(
"Skipping Instagram post ID %s, already exists", media_data["id"]
"Skipping Instagram post ID %s, already exists",
remote_post.shortcode,
)
continue
post = InstagramPost(
remote_id=media_data["id"],
author_name=name,
author_username=user_data["username"],
timestamp=datetime.datetime.strptime(
media_data["timestamp"],
"%Y-%m-%dT%H:%M:%S%z",
),
caption=media_data["caption"],
url=media_data["permalink"],
local_post_instance = InstagramPost(
remote_id=remote_post.shortcode,
author_name=profile.full_name,
author_username=profile.username,
timestamp=remote_post.date_local,
caption=remote_post.caption,
url=f"https://instagram.com/p/{remote_post.shortcode}",
)
post.image.save(
*self.download_remote_image(media_data["media_url"]),
local_post_instance.image.save(
*self.download_remote_image(remote_post.url),
False, # Don't save yet
)
post.save()
local_post_instance.save()
logger.info(
"Saved Instagram post ID %s",
post.remote_id,
remote_post.mediaid,
)
def perform_update(self) -> None:
user_info_list = self.get_user_info_list()
media_list = []
for user_info in user_info_list:
self.parse_media_for_user(*user_info)
for username in self.get_usernames():
self.parse_media_for_user(username)
......@@ -361,19 +361,12 @@ class CardLinkWithHeadlineBlock(CardLinkWithHeadlineBlockMixin):
class InstagramAccessBlock(StructBlock):
name = CharBlock(label="Zobrazované jméno")
username = CharBlock(
label="Username", help_text="Např. pirati.cz, bez @ na začátku!"
label="Uživatelské jméno", help_text="Např. pirati.cz, bez @ na začátku!"
)
access_token = CharBlock(label="Přístupový token")
class Meta:
label = "Synchronizace s Instagramem"
help_text = (
"Informace lze získat přihlášením požadovaným Instagramovým "
"účtem na tools.pirati.cz/instagram . Token je třeba kvůli "
"podmínkám Instagramu každých 60 dní obnovit."
)
class InstagramPostsBlock(StructBlock):
......
# Generated by Django 4.1.10 on 2023-07-08 06:23
import wagtail.blocks
import wagtail.fields
from django.db import migrations, models
class Migration(migrations.Migration):
    """
    Switch the Instagram settings of the ``main`` app to plain usernames.

    Removes ``instagram_access`` from ``mainpersonpage``, adds a nullable
    ``instagram_username`` field to the same model, and redefines
    ``mainhomepage.instagram_access`` so that each stream block carries
    only a ``username`` value.
    """

    dependencies = [
        ("main", "0056_remove_mainpersonpage_ical_calendar_url_and_more"),
    ]
    operations = [
        # NOTE: this migration contains no data-copying step, so whatever was
        # stored in the removed field is not carried over to the new one.
        migrations.RemoveField(
            model_name="mainpersonpage",
            name="instagram_access",
        ),
        # Optional (blank/null allowed) username, at most 64 characters.
        migrations.AddField(
            model_name="mainpersonpage",
            name="instagram_username",
            field=models.CharField(
                blank=True,
                max_length=64,
                null=True,
                verbose_name="Uživatelské jméno na Instagramu",
            ),
        ),
        # The homepage keeps its StreamField, but the struct block now
        # declares a single child: "username".
        migrations.AlterField(
            model_name="mainhomepage",
            name="instagram_access",
            field=wagtail.fields.StreamField(
                [
                    (
                        "instagram_access",
                        wagtail.blocks.StructBlock(
                            [
                                (
                                    "username",
                                    wagtail.blocks.CharBlock(
                                        help_text="Např. pirati.cz, bez @ na začátku!",
                                        label="Uživatelské jméno",
                                    ),
                                )
                            ]
                        ),
                    )
                ],
                blank=True,
                use_json_field=True,
                verbose_name="Uživatelská jména synchronizovaných Instagram účtů",
            ),
        ),
    ]
......@@ -141,7 +141,7 @@ class MainHomePage(
instagram_access = StreamField(
[("instagram_access", blocks.InstagramAccessBlock())],
verbose_name="Uživatelská jména a přístupové tokeny pro synchronizované Instagram účty",
verbose_name="Uživatelská jména synchronizovaných Instagram účtů",
blank=True,
max_num=64,
use_json_field=True,
......@@ -764,13 +764,8 @@ class MainPersonPage(
perex = models.TextField()
text = RichTextField()
instagram_access = StreamField(
[
("instagram_access", blocks.InstagramAccessBlock()),
],
verbose_name="Synchronizace s Instagramem",
blank=True,
use_json_field=True,
instagram_username = models.CharField(
"Uživatelské jméno na Instagramu", max_length=64, blank=True, null=True
)
social_links = StreamField(
......@@ -807,11 +802,11 @@ class MainPersonPage(
FieldPanel("after_name"),
FieldPanel("position"),
FieldPanel("perex"),
FieldPanel("instagram_access"),
FieldPanel("text"),
FieldPanel("email"),
FieldPanel("phone"),
FieldPanel("calendar_url"),
FieldPanel("instagram_username"),
FieldPanel("social_links"),
FieldPanel("people"),
]
......@@ -819,12 +814,10 @@ class MainPersonPage(
def get_context(self, request) -> dict:
context = super().get_context(request)
if len(self.instagram_access.raw_data) != 0:
if self.instagram_username:
context["instagram_post_list"] = (
InstagramPost.objects.filter(
author_username=self.instagram_access.raw_data[0]["value"][
"username"
]
author_username=self.instagram_username
).order_by("-timestamp")
)[:20]
......
wagtail
wagtail<5.0 # For now
wagtail-metadata
wagtail-trash
django<4.2 # Wagtail compatibility
django-environ<0.10.0
django-extensions
django-redis
......@@ -14,6 +15,7 @@ opencv-python
requests
icalevents
ics
instaloader
arrow
sentry-sdk
Markdown
......
......@@ -12,13 +12,13 @@ arrow==1.2.3
# via
# -r base.in
# ics
asgiref==3.6.0
asgiref==3.7.2
# via django
asttokens==2.2.1
# via stack-data
async-timeout==4.0.2
# via redis
attrs==22.2.0
attrs==23.1.0
# via
# cattrs
# ics
......@@ -29,17 +29,17 @@ beautifulsoup4==4.11.2
# via
# -r base.in
# wagtail
billiard==3.6.4.0
billiard==4.1.0
# via celery
bleach==6.0.0
# via -r base.in
brotli==1.0.9
# via fonttools
cattrs==22.2.0
cattrs==23.1.2
# via requests-cache
celery==5.2.7
celery==5.3.1
# via -r base.in
certifi==2022.12.7
certifi==2023.5.7
# via
# requests
# sentry-sdk
......@@ -47,9 +47,9 @@ cffi==1.15.1
# via
# cryptography
# weasyprint
charset-normalizer==3.1.0
charset-normalizer==3.2.0
# via requests
click==8.1.3
click==8.1.4
# via
# celery
# click-didyoumean
......@@ -59,9 +59,9 @@ click-didyoumean==0.3.0
# via celery
click-plugins==1.1.1
# via celery
click-repl==0.2.0
click-repl==0.3.0
# via celery
cryptography==40.0.1
cryptography==41.0.1
# via
# josepy
# mozilla-django-oidc
......@@ -72,8 +72,9 @@ datetime==4.9
# via icalevents
decorator==5.1.1
# via ipython
django==4.1.8
django==4.1.10
# via
# -r base.in
# django-extensions
# django-filter
# django-modelcluster
......@@ -89,7 +90,7 @@ django==4.1.8
# wagtail
django-environ==0.9.0
# via -r base.in
django-extensions==3.2.1
django-extensions==3.2.3
# via -r base.in
django-filter==22.1
# via wagtail
......@@ -99,15 +100,15 @@ django-permissionedforms==0.1
# via wagtail
django-ranged-response==0.2.0
# via django-simple-captcha
django-redis==5.2.0
django-redis==5.3.0
# via -r base.in
django-settings-export==1.2.1
# via -r base.in
django-simple-captcha==0.5.17
django-simple-captcha==0.5.18
# via -r base.in
django-taggit==3.1.0
# via wagtail
django-treebeard==4.6.1
django-treebeard==4.7
# via wagtail
django-widget-tweaks==1.4.12
# via -r base.in
......@@ -119,9 +120,9 @@ et-xmlfile==1.1.0
# via openpyxl
executing==1.2.0
# via stack-data
fastjsonschema==2.16.3
fastjsonschema==2.17.1
# via -r base.in
fonttools[woff]==4.39.3
fonttools[woff]==4.40.0
# via weasyprint
html5lib==1.1
# via
......@@ -137,13 +138,15 @@ ics==0.7.2
# via -r base.in
idna==3.4
# via requests
ipython==8.12.0
instaloader==4.9.6
# via -r base.in
ipython==8.14.0
# via -r base.in
jedi==0.18.2
# via ipython
josepy==1.13.0
# via mozilla-django-oidc
kombu==5.2.4
kombu==5.3.1
# via celery
l18n==2021.3
# via wagtail
......@@ -153,13 +156,13 @@ matplotlib-inline==0.1.6
# via ipython
mozilla-django-oidc==2.0.0
# via pirates
numpy==1.24.2
numpy==1.25.0
# via opencv-python
oauthlib==3.2.2
# via
# requests-oauthlib
# tweepy
opencv-python==4.7.0.72
opencv-python==4.8.0.74
# via -r base.in
openpyxl==3.1.2
# via wagtail
......@@ -176,9 +179,9 @@ pillow==9.5.0
# weasyprint
pirates==0.6.0
# via -r base.in
platformdirs==3.2.0
platformdirs==3.8.1
# via requests-cache
prompt-toolkit==3.0.38
prompt-toolkit==3.0.39
# via
# click-repl
# ipython
......@@ -190,13 +193,13 @@ pure-eval==0.2.2
# via stack-data
pycparser==2.21
# via cffi
pydyf==0.6.0
pydyf==0.7.0
# via weasyprint
pygments==2.15.0
pygments==2.15.1
# via ipython
pyopenssl==23.1.1
pyopenssl==23.2.0
# via josepy
pyparsing==3.0.9
pyparsing==3.1.0
# via httplib2
pypdf2==3.0.1
# via -r base.in
......@@ -205,12 +208,12 @@ pyphen==0.14.0
python-dateutil==2.8.2
# via
# arrow
# celery
# icalendar
# icalevents
# ics
pytz==2021.3
# via
# celery
# datetime
# django-modelcluster
# djangorestframework
......@@ -219,41 +222,41 @@ pytz==2021.3
# l18n
pyyaml==6.0
# via -r base.in
redis==4.5.4
redis==4.6.0
# via django-redis
requests==2.28.2
requests==2.31.0
# via
# -r base.in
# instaloader
# mozilla-django-oidc
# requests-cache
# requests-oauthlib
# tweepy
# wagtail
requests-cache==1.0.1
requests-cache==1.1.0
# via -r base.in
requests-oauthlib==1.3.1
# via tweepy
sentry-sdk==1.19.1
sentry-sdk==1.27.1
# via -r base.in
six==1.16.0
# via
# asttokens
# bleach
# click-repl
# html5lib
# ics
# l18n
# python-dateutil
# url-normalize
soupsieve==2.4
soupsieve==2.4.1
# via beautifulsoup4
sqlparse==0.4.3
sqlparse==0.4.4
# via django
stack-data==0.6.2
# via ipython
tatsu==5.8.3
# via ics
telepath==0.3
telepath==0.3.1
# via wagtail
tinycss2==1.2.1
# via
......@@ -263,11 +266,13 @@ traitlets==5.9.0
# via
# ipython
# matplotlib-inline
tweepy==4.13.0
tweepy==4.14.0
# via -r base.in
tzdata==2023.3
# via celery
url-normalize==1.4.3
# via requests-cache
urllib3==1.26.15
urllib3==2.0.3
# via
# requests
# requests-cache
......@@ -277,18 +282,18 @@ vine==5.0.0
# amqp
# celery
# kombu
wagtail==4.2.2
wagtail==4.2.4
# via
# -r base.in
# wagtail-metadata
# wagtail-trash
wagtail-metadata==4.0.3
# via -r base.in
wagtail-trash==1.0.0
wagtail-trash==1.0.1
# via -r base.in
wcwidth==0.2.6
# via prompt-toolkit
weasyprint==58.1
weasyprint==59.0
# via -r base.in
webencodings==0.5.1
# via
......@@ -296,7 +301,7 @@ webencodings==0.5.1
# cssselect2
# html5lib
# tinycss2
whitenoise==6.4.0
whitenoise==6.5.0
# via -r base.in
willow==1.4.1
# via wagtail
......
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile dev.in
#
asgiref==3.6.0
asgiref==3.7.2
# via django
coverage[toml]==7.2.3
coverage[toml]==7.2.7
# via pytest-cov
django==4.1.8
django==4.1.10
# via
# -r dev.in
# django-debug-toolbar
django-debug-toolbar==4.0.0
django-debug-toolbar==4.1.0
# via -r dev.in
exceptiongroup==1.1.1
# via pytest
factory-boy==3.2.1
# via pytest-factoryboy
faker==18.4.0
faker==18.13.0
# via factory-boy
fastdiff==0.3.0
# via snapshottest
......@@ -32,9 +30,9 @@ packaging==23.1
# via
# pytest
# pytest-sugar
pluggy==1.0.0
pluggy==1.2.0
# via pytest
pytest==7.3.0
pytest==7.4.0
# via
# -r dev.in
# pytest-cov
......@@ -43,7 +41,7 @@ pytest==7.3.0
# pytest-freezegun
# pytest-mock
# pytest-sugar
pytest-cov==4.0.0
pytest-cov==4.1.0
# via -r dev.in
pytest-django==4.5.2
# via -r dev.in
......@@ -51,7 +49,7 @@ pytest-factoryboy==2.5.1
# via -r dev.in
pytest-freezegun==0.4.2
# via -r dev.in
pytest-mock==3.10.0
pytest-mock==3.11.1
# via -r dev.in
pytest-sugar==0.9.7
# via -r dev.in
......@@ -65,19 +63,15 @@ six==1.16.0
# snapshottest
snapshottest==0.6.0
# via -r dev.in
sqlparse==0.4.3
sqlparse==0.4.4
# via
# django
# django-debug-toolbar
termcolor==2.2.0
termcolor==2.3.0
# via
# pytest-sugar
# snapshottest
tomli==2.0.1
# via
# coverage
# pytest
typing-extensions==4.5.0
typing-extensions==4.7.1
# via pytest-factoryboy
wasmer==1.1.0
# via fastdiff
......
#
# This file is autogenerated by pip-compile with Python 3.10
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile production.in
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment