Also save pad content on crawl
This commit is contained in:
parent
7c770166a7
commit
11b5541c0f
1 changed file with 8 additions and 4 deletions
|
@@ -8,15 +8,17 @@ from urllib.parse import urlparse
|
||||||
from typing import List
|
from typing import List
|
||||||
from os.path import exists
|
from os.path import exists
|
||||||
|
|
||||||
PATH = "images/"
|
IMG_PATH = "images/"
|
||||||
|
PAD_PATH = "pads/"
|
||||||
NEW_NETLOC = ""
|
NEW_NETLOC = ""
|
||||||
|
|
||||||
seen_pads = set()
|
seen_pads = set()
|
||||||
|
|
||||||
pads_json = []
|
pads_json = []
|
||||||
|
|
||||||
os.makedirs(PATH, exist_ok=True)
|
os.makedirs(PAD_PATH, exist_ok=True)
|
||||||
os.makedirs(PATH + "uploads", exist_ok=True)
|
os.makedirs(IMG_PATH, exist_ok=True)
|
||||||
|
os.makedirs(IMG_PATH + "uploads", exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
class Image:
|
class Image:
|
||||||
|
@@ -27,7 +29,7 @@ class Image:
|
||||||
|
|
||||||
def download(self):
|
def download(self):
|
||||||
r = requests.get(self.old_url)
|
r = requests.get(self.old_url)
|
||||||
p = PATH + urlparse(self.old_url).path
|
p = IMG_PATH + urlparse(self.old_url).path
|
||||||
with open(p, 'wb') as f:
|
with open(p, 'wb') as f:
|
||||||
f.write(r.content)
|
f.write(r.content)
|
||||||
print(f"Downloaded image {self.old_url} to {p}")
|
print(f"Downloaded image {self.old_url} to {p}")
|
||||||
|
@@ -70,6 +72,8 @@ class PadPage:
|
||||||
r = requests.get(self.old_url + "/download")
|
r = requests.get(self.old_url + "/download")
|
||||||
if r.status_code == 200:
|
if r.status_code == 200:
|
||||||
self.content = r.text
|
self.content = r.text
|
||||||
|
with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f:
|
||||||
|
f.write(self.content)
|
||||||
num = self._find_images()
|
num = self._find_images()
|
||||||
print(f"Found {num} images")
|
print(f"Found {num} images")
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Add table
Reference in a new issue