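"""Recursively download all images from a HedgeDoc pad.

Starting from one or more pad URLs, this script fetches each pad's markdown,
downloads every embedded image hosted on the same instance into images/, and
follows links to other pads on that instance. A mapping of old pad URLs to
their URLs on the new host (given via --replace) is written to pads.json.
"""
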
import argparse
import json
import os
import re
from os.path import exists
from typing import List
from urllib.parse import urlparse

import requests

# Directory that downloaded images are written to.
PATH = "images/"
# Netloc (host) of the new pad instance; set from the --replace argument.
NEW_NETLOC = ""

# Pad URLs that have already been processed, to break link cycles.
seen_pads = set()

# old_url -> new_url mapping entries, serialized to pads.json at the end.
pads_json = []

os.makedirs(PATH, exist_ok=True)
os.makedirs(PATH + "uploads", exist_ok=True)


class Image:
    def __init__(self, _old_url: str) -> None:
        self.old_url = _old_url
        old = urlparse(self.old_url)
        # Relative URL the image will have once it sits next to the pad.
        self.new_url = "." + old.path

    def download(self):
        r = requests.get(self.old_url)
        # The URL path starts with "/", so strip it before joining with PATH.
        p = os.path.join(PATH, urlparse(self.old_url).path.lstrip("/"))
        with open(p, "wb") as f:
            f.write(r.content)
        print(f"Downloaded image {self.old_url} to {p}")


class PadPage:
    def __init__(self, _old_url: str):
        self.old_url: str = _old_url
        self.images: List[Image] = []
        self.linked_pads: List["PadPage"] = []
        self.content: str = ""

    def gather_linked_pads(self):
        # Match pad URLs; (?!upload) excludes /uploads/... image links.
        regex = r'https://[\w.]+/(?!upload)[\w-]+'
        matches = re.findall(regex, self.content)

        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            # Only follow pads hosted on the same instance.
            if url.netloc == full_url.netloc:
                self.linked_pads.append(PadPage(match))
            else:
                print("Dropped pad, wrong netloc")

    def to_dict(self) -> dict:
        # Rewrite the pad URL onto the new host given via --replace.
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return {"old_url": self.old_url, "new_url": new.geturl()}

    def to_json(self) -> str:
        return json.dumps(self.to_dict())
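
    # For illustration (URLs are hypothetical), one mapping entry looks like:
    #   {"old_url": "https://old.example.org/MyPad",
    #    "new_url": "https://new.example.org/MyPad"}
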
    def download_and_process_content(self):
        print(f"Downloading pad at {self.old_url}")
        seen_pads.add(self.old_url)
        # HedgeDoc serves the raw markdown under <pad_url>/download.
        r = requests.get(self.old_url + "/download")
        if r.status_code == 200:
            self.content = r.text
            num = self._find_images()
            print(f"Found {num} images")
        else:
            print(f"Error downloading pad {self.old_url}, got HTTP status code {r.status_code}")

    # Returns the number of image URLs found in the pad content.
    def _find_images(self) -> int:
        # Match direct links to images uploaded to a HedgeDoc instance.
        regex = r'https://[\w.]+/uploads/\w+\.(?:png|jpg|jpeg|webp)'
        matches = re.findall(regex, self.content)

        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            # Only download images hosted on the same instance as the pad.
            if url.netloc == full_url.netloc:
                self.images.append(Image(match))
            else:
                print("Dropped image, wrong netloc")
        return len(matches)

    def download_images(self):
        for i in self.images:
            i.download()

    def recursive(self):
        # Depth-first walk over linked pads, skipping any pad seen before.
        for pad in self.linked_pads:
            if pad.old_url not in seen_pads:
                print(f"New pad found: {pad.old_url}")
                pad.download_and_process_content()
                pad.download_images()
                pad.gather_linked_pads()
                pads_json.append(pad.to_dict())
                pad.recursive()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recursively downloads all images from a HedgeDoc pad")
    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
                        help="URLs of the pads to start searching from")
    # --replace is required because NEW_NETLOC is read from it below.
    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
                        required=True,
                        help="netloc (host) of the new pad instance used when generating the JSON mapping")

    args = parser.parse_args()
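
    # Example invocation (script name and URLs are illustrative):
    #   python pad_mirror.py https://pad.example.org/StartPad --replace pad.new-example.org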

    # Resume support: entries from a previous run are kept and their pads
    # are not downloaded again.
    old_file = []
    if exists("pads.json"):
        with open("pads.json", "r") as f:
            old_file = json.load(f)

    for entry in old_file:
        seen_pads.add(entry["old_url"])

    pads_json.extend(old_file)
    print(f"Seen pads: {seen_pads}")

    NEW_NETLOC = args.new_pad_url[0]

    # Crawl each start pad; recursive() then walks all pads linked from it.
    for pad_url in args.pad_url:
        pad = PadPage(pad_url)
        pad.download_and_process_content()
        pad.download_images()
        pad.gather_linked_pads()
        pads_json.append(pad.to_dict())
        pad.recursive()

    print(f"We have seen {len(seen_pads)} pads, which were {seen_pads}")

    with open("pads.json", "w") as f:
        json.dump(pads_json, f)
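
# After a run, the resulting layout looks roughly like this (names are
# illustrative; image file names depend on the HedgeDoc instance):
#   images/uploads/<id>.png   downloaded images
#   pads.json                 [{"old_url": ..., "new_url": ...}, ...]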