From 7c770166a75b2930292e0c80ac466c21299ac7d3 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Thu, 15 Sep 2022 19:31:00 +0200 Subject: [PATCH] Add ability to specify multiple metapads --- README.md | 4 ++-- hedgedoc-image.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 7215df9..64ccbb3 100644 --- a/README.md +++ b/README.md @@ -36,12 +36,12 @@ pip install -r requirements.txt Start the extractor ``` -python hedgedoc-image.py meta_pad new_netloc +python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ... ``` For example: ``` -python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de +python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta ``` ### Produced files diff --git a/hedgedoc-image.py b/hedgedoc-image.py index dcb5d90..753fd44 100644 --- a/hedgedoc-image.py +++ b/hedgedoc-image.py @@ -2,7 +2,6 @@ import requests import json import re import argparse - import os from urllib.parse import urlparse @@ -12,7 +11,7 @@ from os.path import exists PATH = "images/" NEW_NETLOC = "" -seen_pads = [] +seen_pads = set() pads_json = [] @@ -67,7 +66,7 @@ class PadPage: def download_and_process_content(self): print(f"Downloading pad at {self.old_url}") - seen_pads.append(self.old_url) + seen_pads.add(self.old_url) r = requests.get(self.old_url + "/download") if r.status_code == 200: self.content = r.text @@ -109,9 +108,9 @@ class PadPage: if __name__ == "__main__": parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad") - parser.add_argument("pad_url", metavar="N", type=str, nargs=1, - help="url of the pad to start searching") - parser.add_argument("new_pad_url", metavar="N", type=str, nargs=1, + parser.add_argument("pad_url", metavar="N", type=str, nargs="+", + help="urls of the pad to start searching") + parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1, help="url of the new pad for generating the json") args = parser.parse_args() @@ -123,19 +122,20 @@ if __name__ == "__main__": old_file = json.load(f) for entry in old_file: - seen_pads.append(entry["old_url"]) + seen_pads.add(entry["old_url"]) pads_json.extend(old_file) print(f"Seen Pads: {seen_pads}") NEW_NETLOC = args.new_pad_url[0] - pad = PadPage(args.pad_url[0]) - pad.download_and_process_content() - pad.download_images() - pad.gather_linked_pads() - pads_json.append(pad.to_dict()) - pad.recursive() + for pad_url in args.pad_url: + pad = PadPage(pad_url) + pad.download_and_process_content() + pad.download_images() + pad.gather_linked_pads() + pads_json.append(pad.to_dict()) + pad.recursive() print(f"We have seen {len(seen_pads)} which were {seen_pads}")