Add python extractor

2022-09-14 14:04:35 +02:00 · 2022-09-14 14:04:35 +02:00 · 7a422ef89b
commit 7a422ef89b
parent 52cd87650d
4 changed files with 174 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,8 @@
+# Python
+__pycache__
+venv/
+images/
+pads.json
 # ---> Node
 # Logs
 logs
--- a/README.md
+++ b/README.md
@ -8,6 +8,37 @@ Follow the guide over [here](https://docs.cypress.io/guides/getting-started/inst

 You also require NodeJS. Run `npm ci` to install the required packages.

+Create a new Python venv:
+```
+python -m venv venv
+```
+
+Activate the new environment (Linux):
+
+```
+source venv/bin/activate
+```
+
+Install the requirements:
+```
+pip install -r requirements.txt
+```
+
 ## Execution

 To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api).
+
+Start the extractor:
+```
+python hedgedoc-image.py meta_pad new_netloc
+```
+
+For example:
+```
+python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de
+```
+
+## Produced files
+
+The Python script produces a `pads.json` file, which contains the mapping from `old_url` to `new_url`.
+All images land in `images/uploads`. Only images hosted on the `old_pads` URL are saved.
--- a/hedgedoc-image.py
+++ b/hedgedoc-image.py
@ -0,0 +1,130 @@
+import requests
+import json
+import re
+import argparse
+
+import os
+
+from urllib.parse import urlparse
+from typing import List
+
+# Root directory for downloaded images; the path of each image URL is appended to it.
+PATH = "images/"
+# Host name written into the rewritten pad URLs; assigned from the CLI arguments.
+NEW_NETLOC = ""
+
+# URLs of all pads that have already been requested.
+seen_pads = []
+
+# Collected {"old_url": ..., "new_url": ...} records that are dumped to pads.json.
+pads_json = []
+
+# Make sure the target directories exist before any image is written.
+os.makedirs(PATH, exist_ok=True)
+os.makedirs(os.path.join(PATH, "uploads"), exist_ok=True)
+# TODO: Loop detection
+# TODO: Recursion
+class Image:
+    """An image referenced by a pad, identified by the URL it is hosted at.
+
+    Attributes:
+        old_url: Absolute URL the image is downloaded from.
+        new_url: Relative location of the image ("." followed by the URL path).
+    """
+
+    def __init__(self, _old_url: str) -> None:
+        self.old_url = _old_url
+        old = urlparse(self.old_url)
+        self.new_url = "." + old.path
+
+    def download(self) -> None:
+        """Fetch the image and store it below ``PATH``, mirroring the URL path.
+
+        Nothing is written unless the server answers with HTTP 200, so the body
+        of an error response is never saved under an image file name.
+        """
+        r = requests.get(self.old_url)
+        if r.status_code != 200:
+            print(f"Error downloading image {self.old_url}, got HTTP status code {r.status_code}")
+            return
+
+        # The URL path starts with "/"; strip it so the join stays below PATH
+        # and does not produce a doubled separator.
+        p = os.path.join(PATH, urlparse(self.old_url).path.lstrip("/"))
+        directory = os.path.dirname(p)
+        if directory:
+            # Create the parent directory in case the URL path is nested
+            # deeper than the directories prepared at import time.
+            os.makedirs(directory, exist_ok=True)
+        with open(p, 'wb') as f:
+            f.write(r.content)
+        print(f"Downloaded image {self.old_url} to {p}")
+
+
+class PadPage:
+    """A single pad together with the images and pads its content links to.
+
+    Attributes:
+        old_url: URL of the pad on the original instance.
+        images: ``Image`` objects found in the content that live on the same host.
+        linked_pads: ``PadPage`` objects for same-host pad links found in the content.
+        content: Raw pad text as returned by the ``/download`` endpoint.
+    """
+
+    # Any https URL whose first path segment does not start with "upload".
+    _PAD_LINK_RE = re.compile(r'https://[\w\d.]+/(?!upload)[\w\-_]+')
+    # https URLs pointing at an image file below "/uploads/".
+    _IMAGE_LINK_RE = re.compile(r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)')
+
+    def __init__(self, _old_url: str):
+        self.old_url: str = _old_url
+        self.images: List["Image"] = []
+        self.linked_pads: List["PadPage"] = []
+        self.content: str = ""
+
+    def gather_linked_pads(self):
+        """Collect links to other pads on the same host from ``self.content``.
+
+        Every match is wrapped in a new ``PadPage`` and appended to
+        ``self.linked_pads``; links to other hosts are dropped.
+        """
+        own_netloc = urlparse(self.old_url).netloc
+        for match in self._PAD_LINK_RE.findall(self.content):
+            print(f"match: {match}")
+            if urlparse(match).netloc == own_netloc:
+                self.linked_pads.append(PadPage(match))
+            else:
+                print("Dropped pad, wrong netloc")
+
+    def _new_url(self) -> str:
+        """Return ``old_url`` with its host replaced by the module-level ``NEW_NETLOC``."""
+        return urlparse(self.old_url)._replace(netloc=NEW_NETLOC).geturl()
+
+    def to_dict(self) -> dict:
+        """Return the ``old_url`` -> ``new_url`` mapping of this pad as a dict."""
+        # Use the configured NEW_NETLOC (as to_json does) instead of a
+        # hard-coded host, so the CLI argument is honoured in pads.json.
+        return {"old_url": self.old_url, "new_url": self._new_url()}
+
+    def to_json(self) -> str:
+        """Return the same mapping as ``to_dict`` serialized to a JSON string."""
+        return json.dumps(self.to_dict())
+
+    def download_and_process_content(self):
+        """Download the pad text and scan it for images.
+
+        The pad URL is recorded in ``seen_pads`` before the request is made.
+        On a non-200 response an error is printed and ``content`` stays unchanged.
+        """
+        print(f"Downloading pad at {self.old_url}")
+        seen_pads.append(self.old_url)
+        r = requests.get(self.old_url + "/download")
+        if r.status_code == 200:
+            self.content = r.text
+            num = self._find_images()
+            print(f"Found {num} images")
+        else:
+            print(f"Error downloading Pad {self.old_url}, got HTTP status code {r.status_code}")
+
+    def _find_images(self) -> int:
+        """Register same-host images referenced in ``self.content``.
+
+        Returns:
+            The number of image URLs matched in the content, including those
+            dropped because they are hosted elsewhere.
+        """
+        matches = self._IMAGE_LINK_RE.findall(self.content)
+        own_netloc = urlparse(self.old_url).netloc
+        for match in matches:
+            print(f"match: {match}")
+            if urlparse(match).netloc == own_netloc:
+                self.images.append(Image(match))
+            else:
+                print("Dropped image, wrong netloc")
+        return len(matches)
+
+    def download_images(self):
+        """Download every image collected in ``self.images``."""
+        for i in self.images:
+            i.download()
+
+    def recursive(self):
+        """Process all linked pads that have not been seen yet, depth first.
+
+        Each new pad is downloaded, its images are fetched, its own links are
+        gathered, its mapping is appended to ``pads_json``, and then the pad is
+        descended into. Pads already listed in ``seen_pads`` are skipped, which
+        keeps link cycles from being followed again.
+        """
+        for pad in self.linked_pads:
+            if pad.old_url not in seen_pads:
+                print(f"New pad found: {pad.old_url}")
+                pad.download_and_process_content()
+                pad.download_images()
+                pad.gather_linked_pads()
+                pads_json.append(pad.to_dict())
+                pad.recursive()
+
+
+if __name__ == "__main__":
+    # Command line entry point: crawl the start pad and everything it links
+    # to, download the images, and write the URL mapping to pads.json.
+    parser = argparse.ArgumentParser(description="Recursively downloads all images from a hedgedoc pad")
+    parser.add_argument("pad_url", metavar="PAD_URL", type=str,
+                        help="url of the pad to start searching")
+    parser.add_argument("new_pad_url", metavar="NEW_NETLOC", type=str,
+                        help="host name (or full url) of the new pad instance, used for generating the json")
+
+    args = parser.parse_args()
+
+    # Accept a bare host ("pad.example.org") as well as a full URL
+    # ("https://pad.example.org"): a bare host has no netloc component when
+    # parsed, so fall back to the raw argument in that case.
+    NEW_NETLOC = urlparse(args.new_pad_url).netloc or args.new_pad_url
+    pad = PadPage(args.pad_url)
+    pad.download_and_process_content()
+    pad.download_images()
+    pad.gather_linked_pads()
+    pads_json.append(pad.to_dict())
+    pad.recursive()
+
+    print(f"We have seen {len(seen_pads)} which were {seen_pads}")
+
+    with open("pads.json", "w") as f:
+        json.dump(pads_json, f)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
+beautifulsoup4==4.11.1
+bs4==0.0.1
+certifi==2022.6.15.1
+charset-normalizer==2.1.1
+idna==3.3
+requests==2.28.1
+soupsieve==2.3.2.post1
+urllib3==1.26.12