Add ability to specify multiple metapads
This commit is contained in:
parent
226a9c4a38
commit
7c770166a7
2 changed files with 15 additions and 15 deletions
|
@@ -36,12 +36,12 @@ pip install -r requirements.txt
|
||||||
|
|
||||||
Start the extractor
|
Start the extractor
|
||||||
```
|
```
|
||||||
python hedgedoc-image.py meta_pad new_netloc
|
python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ...
|
||||||
```
|
```
|
||||||
|
|
||||||
For example:
|
For example:
|
||||||
```
|
```
|
||||||
python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de
|
python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta
|
||||||
```
|
```
|
||||||
|
|
||||||
### Produced files
|
### Produced files
|
||||||
|
|
|
@@ -2,7 +2,6 @@ import requests
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
@@ -12,7 +11,7 @@ from os.path import exists
|
||||||
PATH = "images/"
|
PATH = "images/"
|
||||||
NEW_NETLOC = ""
|
NEW_NETLOC = ""
|
||||||
|
|
||||||
seen_pads = []
|
seen_pads = set()
|
||||||
|
|
||||||
pads_json = []
|
pads_json = []
|
||||||
|
|
||||||
|
@@ -67,7 +66,7 @@ class PadPage:
|
||||||
|
|
||||||
def download_and_process_content(self):
|
def download_and_process_content(self):
|
||||||
print(f"Downloading pad at {self.old_url}")
|
print(f"Downloading pad at {self.old_url}")
|
||||||
seen_pads.append(self.old_url)
|
seen_pads.add(self.old_url)
|
||||||
r = requests.get(self.old_url + "/download")
|
r = requests.get(self.old_url + "/download")
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
self.content = r.text
|
self.content = r.text
|
||||||
|
@@ -109,9 +108,9 @@ class PadPage:
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
|
parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
|
||||||
parser.add_argument("pad_url", metavar="N", type=str, nargs=1,
|
parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
|
||||||
help="url of the pad to start searching")
|
help="urls of the pad to start searching")
|
||||||
parser.add_argument("new_pad_url", metavar="N", type=str, nargs=1,
|
parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
|
||||||
help="url of the new pad for generating the json")
|
help="url of the new pad for generating the json")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
@@ -123,19 +122,20 @@ if __name__ == "__main__":
|
||||||
old_file = json.load(f)
|
old_file = json.load(f)
|
||||||
|
|
||||||
for entry in old_file:
|
for entry in old_file:
|
||||||
seen_pads.append(entry["old_url"])
|
seen_pads.add(entry["old_url"])
|
||||||
|
|
||||||
pads_json.extend(old_file)
|
pads_json.extend(old_file)
|
||||||
print(f"Seen Pads: {seen_pads}")
|
print(f"Seen Pads: {seen_pads}")
|
||||||
|
|
||||||
|
|
||||||
NEW_NETLOC = args.new_pad_url[0]
|
NEW_NETLOC = args.new_pad_url[0]
|
||||||
pad = PadPage(args.pad_url[0])
|
for pad_url in args.pad_url:
|
||||||
pad.download_and_process_content()
|
pad = PadPage(pad_url)
|
||||||
pad.download_images()
|
pad.download_and_process_content()
|
||||||
pad.gather_linked_pads()
|
pad.download_images()
|
||||||
pads_json.append(pad.to_dict())
|
pad.gather_linked_pads()
|
||||||
pad.recursive()
|
pads_json.append(pad.to_dict())
|
||||||
|
pad.recursive()
|
||||||
|
|
||||||
print(f"We have seen {len(seen_pads)} which were {seen_pads}")
|
print(f"We have seen {len(seen_pads)} which were {seen_pads}")
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue