diff --git a/.gitignore b/.gitignore
index ceaea36..fde6bf5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,8 @@
+# Python
+__pycache__
+venv/
+images/
+pads.json
 # ---> Node
 # Logs
 logs
diff --git a/README.md b/README.md
index ef10214..319aef6 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,37 @@ Follow the guide over [here](https://docs.cypress.io/guides/getting-started/inst
 You also require NodeJS. Run `npm ci` to install the required packages.
 
+Create a new Python venv:
+```
+python -m venv venv
+```
+
+Activate the new environment (Linux):
+
+```
+source venv/bin/activate
+```
+
+Install the requirements:
+```
+pip install -r requirements.txt
+```
+
 ## Execution
 
 To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api).
 
+
+Start the extractor:
+```
+python hedgedoc-image.py <pad_url> <new_netloc>
+```
+
+For example:
+```
+python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de
+```
+
+## Produced files
+
+The Python script produces a `pads.json` file that maps each `old_url` to its `new_url`.
+All images land in `images/uploads`; only images hosted on the old pad's host are saved.
diff --git a/hedgedoc-image.py b/hedgedoc-image.py
new file mode 100644
index 0000000..184b275
--- /dev/null
+++ b/hedgedoc-image.py
@@ -0,0 +1,139 @@
+import requests
+import json
+import re
+import argparse
+
+import os
+
+from urllib.parse import urlparse
+from typing import List
+
+PATH = "images/"
+NEW_NETLOC = ""
+
+# URLs of pads that were already crawled, to avoid loops
+seen_pads = []
+
+# old_url -> new_url mappings, dumped to pads.json at the end
+pads_json = []
+
+os.makedirs(PATH, exist_ok=True)
+os.makedirs(PATH + "uploads", exist_ok=True)
+# TODO: Loop detection
+# TODO: Recursion
+class Image:
+    def __init__(self, _old_url: str) -> None:
+        self.old_url = _old_url
+        old = urlparse(self.old_url)
+        self.new_url = "." + old.path
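+        # e.g. for "https://md.margau.net/uploads/abc.png" (illustrative filename),
+        # old.path is "/uploads/abc.png", so new_url becomes the relative
+        # "./uploads/abc.png", which resolves once the image sits next to the pad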
+
+    def download(self):
+        # fetch the image and store it under images/<path>, mirroring the
+        # /uploads/... path of the old instance
+        r = requests.get(self.old_url)
+        p = PATH + urlparse(self.old_url).path
+        with open(p, 'wb') as f:
+            f.write(r.content)
+        print(f"Downloaded image {self.old_url} to {p}")
+
+
+class PadPage:
+    def __init__(self, _old_url: str):
+        self.old_url: str = _old_url
+        #self.name: str = _name
+        self.images: List = []
+        self.linked_pads = []
+        self.content: str = ""
+
+    def gather_linked_pads(self):
+        # any same-host link that is not an /uploads/... image counts as a pad
+        regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+'
+        matches = re.findall(regex, self.content)
+
+        full_url = urlparse(self.old_url)
+        for match in matches:
+            print(f"match: {match}")
+            url = urlparse(match)
+            if url.netloc == full_url.netloc:
+                self.linked_pads.append(PadPage(match))
+            else:
+                print("Dropped pad, wrong netloc")
+
+    def to_dict(self) -> dict:
+        old = urlparse(self.old_url)
+        new = old._replace(netloc=NEW_NETLOC)
+        return {"old_url": self.old_url, "new_url": new.geturl()}
+
+    def to_json(self) -> str:
+        old = urlparse(self.old_url)
+        new = old._replace(netloc=NEW_NETLOC)
+        return json.dumps({"old_url": self.old_url, "new_url": new.geturl()})
+
+    def download_and_process_content(self):
+        print(f"Downloading pad at {self.old_url}")
+        seen_pads.append(self.old_url)
+        # HedgeDoc serves the raw markdown of a pad under <pad-url>/download
+        r = requests.get(self.old_url + "/download")
+        if r.status_code == 200:
+            self.content = r.text
+            num = self._find_images()
+            print(f"Found {num} images")
+        else:
+            print(f"Error downloading pad {self.old_url}, got HTTP status code {r.status_code}")
+
+    # returns the number of image links found
+    def _find_images(self) -> int:
+        regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)'
+        matches = re.findall(regex, self.content)
+
+        full_url = urlparse(self.old_url)
+        for match in matches:
+            print(f"match: {match}")
+            url = urlparse(match)
+            if url.netloc == full_url.netloc:
+                i = Image(match)
+                self.images.append(i)
+            else:
+                print("Dropped image, wrong netloc")
+        return len(matches)
+
+    def download_images(self):
+        for i in self.images:
+            i.download()
+
+    def recursive(self):
+        # depth-first crawl over linked pads, skipping pads seen before
+        for pad in self.linked_pads:
+            if pad.old_url not in seen_pads:
+                print(f"New pad found: {pad.old_url}")
+                pad.download_and_process_content()
+                pad.download_images()
+                pad.gather_linked_pads()
+                pads_json.append(pad.to_dict())
+                pad.recursive()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Recursively download all images from a HedgeDoc pad and the pads it links to")
+    parser.add_argument("pad_url", type=str, nargs=1,
+                        help="URL of the pad to start searching from")
+    parser.add_argument("new_pad_url", type=str, nargs=1,
+                        help="netloc of the new pad instance, used when generating the JSON")
+
+    args = parser.parse_args()
+
+    NEW_NETLOC = args.new_pad_url[0]
+    pad = PadPage(args.pad_url[0])
+    pad.download_and_process_content()
+    pad.download_images()
+    pad.gather_linked_pads()
+    pads_json.append(pad.to_dict())
+    pad.recursive()
+
+    print(f"Seen {len(seen_pads)} pads: {seen_pads}")
+
+    with open("pads.json", "w") as f:
+        f.write(json.dumps(pads_json))
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..a20d73b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,8 @@
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2022.6.15.1
+charset-normalizer==2.1.1
+idna==3.3
+requests==2.28.1
+soupsieve==2.3.2.post1
+urllib3==1.26.12
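
For reference, a sketch of the `pads.json` produced by the README's example invocation, assuming `https://md.margau.net/dbk-meta` links to no further pads (the script writes the list with `json.dumps`, so it ends up on a single line):

```
[{"old_url": "https://md.margau.net/dbk-meta", "new_url": "https://pad.hacknang.de/dbk-meta"}]
```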