# hedgedoc-migrator/hedgedoc-image.py

import requests
import json
import re
import argparse
import os

from urllib.parse import urlparse
from typing import List
from os.path import exists

# Local directory under which downloaded images are stored; the URL path of
# each image is appended to it verbatim.
PATH = "images/"

# Host name substituted into pad URLs when building the new URL; the script
# entry point overwrites it with the value given via --replace.
NEW_NETLOC = ""

# URLs of every pad that has been requested (or was listed in pads.json).
seen_pads: set = set()

# One {"old_url": ..., "new_url": ...} record per pad, dumped to pads.json.
pads_json: list = []

# Make sure the download directories exist before any image is written.
os.makedirs(PATH, exist_ok=True)
os.makedirs(os.path.join(PATH, "uploads"), exist_ok=True)


class Image:
    """An uploaded image referenced from a pad.

    Attributes:
        old_url: Absolute URL of the image as found in the pad content.
        new_url: Relative reference built as "." followed by the URL path
            of ``old_url``.
    """

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the whole crawl forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, _old_url: str) -> None:
        self.old_url = _old_url
        self.new_url = "." + urlparse(self.old_url).path

    def download(self):
        """Fetch the image and store it below PATH, mirroring its URL path.

        Nothing is written when the server answers with a status other
        than 200, so error pages never end up on disk as image files.
        """
        response = requests.get(self.old_url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Error downloading image {self.old_url}, got HTTP status code {response.status_code}")
            return

        target = PATH + urlparse(self.old_url).path
        # The URL path decides the sub-directory, so create it on demand
        # instead of relying on it having been prepared elsewhere.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(target, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded image {self.old_url} to {target}")


class PadPage:
    """A pad on the old instance, plus the images and pads it links to.

    Attributes:
        old_url: URL of the pad on the old instance.
        images: Image objects for uploads on the same host as the pad.
        linked_pads: PadPage objects for pads linked on the same host.
        content: Text of the pad as returned by ``<old_url>/download``;
            empty until download_and_process_content() succeeds.
    """

    # Host used for the new URL when the module-level NEW_NETLOC is empty.
    DEFAULT_NEW_NETLOC = "pad.hacknang.de"

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # Links to other pads; the lookahead skips paths starting with "upload".
    _PAD_LINK_RE = re.compile(r'https://[\w\d.]+/(?!upload)[\w\-_]+')
    # Links to uploaded images with one of the handled file extensions.
    _IMAGE_RE = re.compile(r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)')

    def __init__(self, _old_url: str):
        self.old_url: str = _old_url
        self.images: List = []
        self.linked_pads = []
        self.content: str = ""

    def _same_host_matches(self, pattern, kind: str) -> List[str]:
        """Return the unique matches of *pattern* that live on this pad's host.

        Args:
            pattern: Compiled regular expression applied to ``self.content``.
            kind: Word used in the log line for dropped matches.

        Returns:
            Matching URLs in order of first appearance, without duplicates
            and without URLs whose netloc differs from the pad's own.
        """
        own_netloc = urlparse(self.old_url).netloc
        kept = []
        # dict.fromkeys() removes repeated URLs while keeping their order.
        for match in dict.fromkeys(pattern.findall(self.content)):
            print(f"match: {match}")
            if urlparse(match).netloc == own_netloc:
                kept.append(match)
            else:
                print(f"Dropped {kind}, wrong netloc")
        return kept

    def gather_linked_pads(self):
        """Collect the pads on the same host that the content links to."""
        for url in self._same_host_matches(self._PAD_LINK_RE, "pad"):
            self.linked_pads.append(PadPage(url))

    def to_dict(self) -> dict:
        """Return the old/new URL pair of this pad as a dict.

        The new URL is the old one with its host replaced by NEW_NETLOC,
        or by DEFAULT_NEW_NETLOC while NEW_NETLOC is still empty.
        """
        netloc = NEW_NETLOC or self.DEFAULT_NEW_NETLOC
        new = urlparse(self.old_url)._replace(netloc=netloc)
        return {"old_url": self.old_url, "new_url": new.geturl()}

    def to_json(self) -> str:
        """Return the same record as to_dict(), serialized as JSON."""
        return json.dumps(self.to_dict())

    def download_and_process_content(self):
        """Download the pad text and register the images it references.

        The pad is marked as seen before the request is made, so a failed
        download is not retried during the same crawl.
        """
        print(f"Downloading pad at {self.old_url}")
        seen_pads.add(self.old_url)
        r = requests.get(self.old_url + "/download", timeout=self.REQUEST_TIMEOUT)
        if r.status_code == 200:
            self.content = r.text
            num = self._find_images()
            print(f"Found {num} images")
        else:
            print(f"Error downloading Pad {self.old_url}, got HTTP status code {r.status_code}")

    def _find_images(self) -> int:
        """Register the uploads referenced in the content.

        Returns:
            Number of images added to ``self.images``; links to other hosts
            and repeated links are not counted.
        """
        urls = self._same_host_matches(self._IMAGE_RE, "image")
        self.images.extend(Image(url) for url in urls)
        return len(urls)

    def download_images(self):
        """Download every image registered for this pad."""
        for i in self.images:
            i.download()

    def recursive(self):
        """Crawl all linked pads that have not been seen yet, depth first.

        Every newly visited pad is downloaded together with its images and
        its record is appended to the module-level pads_json list.
        """
        for pad in self.linked_pads:
            # Checked per iteration: an earlier sibling may have crawled
            # this pad in the meantime.
            if pad.old_url not in seen_pads:
                print(f"New pad found: {pad.old_url}")
                pad.download_and_process_content()
                pad.download_images()
                pad.gather_linked_pads()
                pads_json.append(pad.to_dict())
                pad.recursive()


def main() -> None:
    """Crawl the pads given on the command line and update pads.json.

    Records from an existing pads.json are loaded first, so pads crawled in
    an earlier run are neither downloaded again through links nor recorded
    twice. The combined list is written back to pads.json at the end.
    """
    global NEW_NETLOC

    parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
                        help="urls of the pad to start searching")
    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
                        help="url of the new pad for generating the json")

    args = parser.parse_args()

    if exists("pads.json"):
        with open("pads.json", "r") as f:
            old_file = json.load(f)

        seen_pads.update(entry["old_url"] for entry in old_file)
        pads_json.extend(old_file)
        print(f"Seen Pads: {seen_pads}")

    # --replace is optional; argparse leaves it as None when it is omitted.
    if args.new_pad_url:
        replacement = args.new_pad_url[0]
        # Accept both a bare host ("pad.example.org") and a full URL
        # ("https://pad.example.org"); only the host part is kept.
        NEW_NETLOC = urlparse(replacement).netloc or replacement

    for pad_url in args.pad_url:
        # A start pad may already be listed in pads.json; it is crawled
        # again to discover new links, but must not be recorded twice.
        already_recorded = pad_url in seen_pads
        pad = PadPage(pad_url)
        pad.download_and_process_content()
        pad.download_images()
        pad.gather_linked_pads()
        if not already_recorded:
            pads_json.append(pad.to_dict())
        pad.recursive()

    print(f"We have seen {len(seen_pads)} which were {seen_pads}")

    with open("pads.json", "w") as f:
        json.dump(pads_json, f)


if __name__ == "__main__":
    main()