Add python extractor
parent 52cd87650d
commit 7a422ef89b

4 changed files with 174 additions and 0 deletions
.gitignore (vendored) · 5 additions

@@ -1,3 +1,8 @@
# Python
__pycache__
venv/
images/
pads.json
# ---> Node
# Logs
logs
README.md · 31 additions

@@ -8,6 +8,37 @@ Follow the guide over [here](https://docs.cypress.io/guides/getting-started/inst
You also require NodeJS. Run `npm ci` to install the required packages.

Create a new python venv:
```
python -m venv venv
```

Activate the new environment (Linux):
```
source venv/bin/activate
```

Install the requirements:
```
pip install -r requirements.txt
```
## Execution

To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using Electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api).
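For unattended runs, the stock Cypress CLI can also execute the suite headlessly; a minimal sketch, assuming standard Cypress flags and no repo-specific config:

```
npx cypress run --browser electron
```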
Start the extractor:
```
python hedgedoc-image.py meta_pad new_netloc
```

For example:
```
python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de
```
## Produced files

The python script produces a `pads.json` which contains the mapping from `old_url` to `new_url`.
All images land in `images/uploads`. Only images hosted on the old pad's URL are saved.
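For illustration, a `pads.json` entry produced by the example invocation above would look like this (shape taken from the script's `to_dict`; values inferred, not taken from a real run):

```
[
  {"old_url": "https://md.margau.net/dbk-meta", "new_url": "https://pad.hacknang.de/dbk-meta"}
]
```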
hedgedoc-image.py (new file) · 130 additions

@@ -0,0 +1,130 @@
import argparse
import json
import os
import re

import requests

from urllib.parse import urlparse
from typing import List

PATH = "images/"
NEW_NETLOC = ""

seen_pads = []
pads_json = []

os.makedirs(PATH, exist_ok=True)
os.makedirs(PATH + "uploads", exist_ok=True)

# TODO: Loop detection
# TODO: Recursion


class Image:
    def __init__(self, _old_url: str) -> None:
        self.old_url = _old_url
        old = urlparse(self.old_url)
        self.new_url = "." + old.path

    def download(self):
        r = requests.get(self.old_url)
        # Mirror the upload under images/, keeping its original path
        p = PATH + urlparse(self.old_url).path
        with open(p, 'wb') as f:
            f.write(r.content)
        print(f"Downloaded image {self.old_url} to {p}")


class PadPage:
    def __init__(self, _old_url: str):
        self.old_url: str = _old_url
        self.images: List = []
        self.linked_pads = []
        self.content: str = ""

    def gather_linked_pads(self):
        # Pad links on any https host, excluding /uploads/... image URLs
        regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+'
        matches = re.findall(regex, self.content)

        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                self.linked_pads.append(PadPage(match))
            else:
                print("Dropped pad, wrong netloc")

    def to_dict(self) -> dict:
        # Rewrite the host to the new pad instance, keep the path
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return {"old_url": self.old_url, "new_url": new.geturl()}

    def to_json(self) -> str:
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return json.dumps({"old_url": self.old_url, "new_url": new.geturl()})

    def download_and_process_content(self):
        print(f"Downloading pad at {self.old_url}")
        seen_pads.append(self.old_url)
        r = requests.get(self.old_url + "/download")
        if r.status_code == 200:
            self.content = r.text
            num = self._find_images()
            print(f"Found {num} images")
        else:
            print(f"Error downloading pad {self.old_url}, got HTTP status code {r.status_code}")

    def _find_images(self) -> int:
        """Collects same-netloc image URLs; returns the number of matches found."""
        regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)'
        matches = re.findall(regex, self.content)

        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                self.images.append(Image(match))
            else:
                print("Dropped image, wrong netloc")
        return len(matches)

    def download_images(self):
        for i in self.images:
            i.download()

    def recursive(self):
        # Depth-first walk over linked pads, skipping any pad already seen
        for pad in self.linked_pads:
            if pad.old_url not in seen_pads:
                print(f"New pad found: {pad.old_url}")
                pad.download_and_process_content()
                pad.download_images()
                pad.gather_linked_pads()
                pads_json.append(pad.to_dict())
                pad.recursive()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recursively downloads all images from a hedgedoc pad")
    parser.add_argument("pad_url", type=str,
                        help="url of the pad to start searching")
    parser.add_argument("new_pad_url", type=str,
                        help="url of the new pad for generating the json")

    args = parser.parse_args()

    NEW_NETLOC = args.new_pad_url
    pad = PadPage(args.pad_url)
    pad.download_and_process_content()
    pad.download_images()
    pad.gather_linked_pads()
    pads_json.append(pad.to_dict())
    pad.recursive()

    print(f"We have seen {len(seen_pads)} pads: {seen_pads}")

    with open("pads.json", "w") as f:
        f.write(json.dumps(pads_json))
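To make the netloc rewrite in `to_dict`/`to_json` concrete, here is a minimal standalone sketch (URLs taken from the README example):

```
from urllib.parse import urlparse

# ParseResult is a namedtuple, so _replace swaps out a single component
old = urlparse("https://md.margau.net/dbk-meta")
new = old._replace(netloc="pad.hacknang.de")
print(new.geturl())  # https://pad.hacknang.de/dbk-meta
```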
requirements.txt (new file) · 8 additions

@@ -0,0 +1,8 @@
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15.1
charset-normalizer==2.1.1
idna==3.3
requests==2.28.1
soupsieve==2.3.2.post1
urllib3==1.26.12