import argparse
import json
import os
import re
from os.path import exists
from typing import List
from urllib.parse import urlparse

import requests

IMG_PATH = "images/"
PAD_PATH = "pads/"
NEW_NETLOC = ""

seen_pads = set()
pads_json = []

os.makedirs(PAD_PATH, exist_ok=True)
os.makedirs(IMG_PATH, exist_ok=True)
os.makedirs(IMG_PATH + "uploads", exist_ok=True)


class Image:
    def __init__(self, _old_url: str) -> None:
        self.old_url = _old_url
        old = urlparse(self.old_url)
        self.new_url = "." + old.path

    def download(self):
        # Store the image under IMG_PATH, keeping its URL path (e.g.
        # /uploads/abc.png); the leading slash only yields a harmless
        # double slash on disk.
        r = requests.get(self.old_url)
        p = IMG_PATH + urlparse(self.old_url).path
        with open(p, "wb") as f:
            f.write(r.content)
        print(f"Downloaded image {self.old_url} to {p}")


class PadPage:
    def __init__(self, _old_url: str):
        self.old_url: str = _old_url
        self.images: List[Image] = []
        self.linked_pads: List["PadPage"] = []
        self.content: str = ""

    def gather_linked_pads(self):
        # Pad links are https URLs whose first path segment is not an
        # upload; links to other hosts are ignored.
        regex = r'https://[\w.]+/(?!upload)[\w-]+'
        matches = re.findall(regex, self.content)
        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                self.linked_pads.append(PadPage(match))
            else:
                print("Dropped pad, wrong netloc")

    def to_dict(self) -> dict:
        # Rewrite the pad URL onto the new host given via --replace.
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return {"oldUrl": self.old_url, "newUrl": new.geturl()}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())

    def download_and_process_content(self):
        print(f"Downloading pad at {self.old_url}")
        seen_pads.add(self.old_url)
        r = requests.get(self.old_url + "/download")
        if r.status_code == 200:
            self.content = r.text
            with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f:
                f.write(self.content)
            num = self._find_images()
            print(f"Found {num} images")
        else:
            print(f"Error downloading pad {self.old_url}, got HTTP status code {r.status_code}")

    def _find_images(self) -> int:
        """Collect same-host image uploads referenced in the pad; returns the number of matches."""
        regex = r'https://[\w.]+/uploads/\w+\.(?:png|jpg|jpeg|webp)'
        matches = re.findall(regex, self.content)
        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                self.images.append(Image(match))
            else:
                print("Dropped image, wrong netloc")
        return len(matches)

    def download_images(self):
        for i in self.images:
            i.download()

    def recursive(self):
        # Depth-first traversal of linked pads; seen_pads prevents cycles.
        for pad in self.linked_pads:
            if pad.old_url not in seen_pads:
                print(f"New pad found: {pad.old_url}")
                pad.download_and_process_content()
                pad.download_images()
                pad.gather_linked_pads()
                pads_json.append(pad.to_dict())
                pad.recursive()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recursively downloads all images from a HedgeDoc pad")
    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
                        help="URLs of the pads to start searching from")
    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1, required=True,
                        help="URL of the new pad instance; its host replaces the old one in the generated JSON")
    args = parser.parse_args()

    # Resume support: pads already recorded in pads.json are not fetched again.
    old_file = []
    if exists("pads.json"):
        with open("pads.json", "r") as f:
            old_file = json.load(f)
        for entry in old_file:
            seen_pads.add(entry["oldUrl"])
        pads_json.extend(old_file)
        print(f"Seen pads: {seen_pads}")

    # Accept either a full URL or a bare hostname for --replace.
    NEW_NETLOC = urlparse(args.new_pad_url[0]).netloc or args.new_pad_url[0]

    for pad_url in args.pad_url:
        pad = PadPage(pad_url)
        pad.download_and_process_content()
        pad.download_images()
        pad.gather_linked_pads()
        pads_json.append(pad.to_dict())
        pad.recursive()

    print(f"We have seen {len(seen_pads)} pads, which were: {seen_pads}")
    with open("pads.json", "w") as f:
        json.dump(pads_json, f)