Add ability to specify multiple metapads

This commit is contained in:
Nick Hahn 2022-09-15 19:31:00 +02:00
parent 226a9c4a38
commit 7c770166a7
2 changed files with 15 additions and 15 deletions

View file

@ -36,12 +36,12 @@ pip install -r requirements.txt
Start the extractor:
```
python hedgedoc-image.py meta_pad new_netloc
python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ...
```
For example:
```
python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de
python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta
```
### Produced files

View file

@ -2,7 +2,6 @@ import requests
import json
import re
import argparse
import os
from urllib.parse import urlparse
@ -12,7 +11,7 @@ from os.path import exists
PATH = "images/"
NEW_NETLOC = ""
seen_pads = []
seen_pads = set()
pads_json = []
@ -67,7 +66,7 @@ class PadPage:
def download_and_process_content(self):
print(f"Downloading pad at {self.old_url}")
seen_pads.append(self.old_url)
seen_pads.add(self.old_url)
r = requests.get(self.old_url + "/download")
if r.status_code == 200:
self.content = r.text
@ -109,9 +108,9 @@ class PadPage:
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
parser.add_argument("pad_url", metavar="N", type=str, nargs=1,
help="url of the pad to start searching")
parser.add_argument("new_pad_url", metavar="N", type=str, nargs=1,
parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
help="urls of the pad to start searching")
parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
help="url of the new pad for generating the json")
args = parser.parse_args()
@ -123,19 +122,20 @@ if __name__ == "__main__":
old_file = json.load(f)
for entry in old_file:
seen_pads.append(entry["old_url"])
seen_pads.add(entry["old_url"])
pads_json.extend(old_file)
print(f"Seen Pads: {seen_pads}")
NEW_NETLOC = args.new_pad_url[0]
pad = PadPage(args.pad_url[0])
pad.download_and_process_content()
pad.download_images()
pad.gather_linked_pads()
pads_json.append(pad.to_dict())
pad.recursive()
for pad_url in args.pad_url:
pad = PadPage(pad_url)
pad.download_and_process_content()
pad.download_images()
pad.gather_linked_pads()
pads_json.append(pad.to_dict())
pad.recursive()
print(f"We have seen {len(seen_pads)} which were {seen_pads}")