Compare commits
2 commits
| Author | SHA1 | Date |
| --- | --- | --- |
|  | d10cf37bfd |  |
|  | 2b30b277e7 |  |
8 changed files with 60 additions and 252 deletions
.gitignore (vendored), 7 changes

```diff
@@ -1,10 +1,3 @@
-# Python
-__pycache__
-venv/
-images/
-pads/
-pads.json
-new_pads.json
 # ---> Node
 # Logs
 logs
```
README.md, 41 changes

```diff
@@ -2,49 +2,12 @@
 
 Simple script to migrate content from one Hedgedoc instance to another.
 
-## Cypress
+## Setup
 
 Follow the guide over [here](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites).
 
 You also require NodeJS. Run `npm ci` to install the required packages.
 
-### Execution
+## Execution
 
 To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api).
-
-
-## Python
-
-Create new python venv:
-```
-python -m venv venv
-```
-
-Activate the new environment (linux)
-
-```
-source venv/bin/activate
-```
-
-Install the requirements
-```
-pip install -r requirements.txt
-```
-
-### Execution
-
-Start the extractor
-```
-python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ...
-```
-
-For example:
-```
-python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta
-```
-
-### Produced files
-
-The python scripts produces a `pads.json` which contains the mapping from `old_url` to `new_url`.
-All images land in `images/uploads`. Only images hosted on the `old_pads` URL are saved
```
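The README's remark that the manual `npx cypress open` step "could be automated" refers to the Cypress module API. A minimal sketch of that automation, assuming the migration spec lives under the default `cypress/e2e/` layout (the helper file name and spec glob are assumptions; the actual spec path is not shown in this compare view):

```js
// run-migration.js -- hypothetical helper, not part of this change.
// Drives the migration spec headlessly via the Cypress module API.
const cypress = require('cypress');

cypress.run({
  browser: 'electron',
  spec: 'cypress/e2e/**/*.cy.js', // assumed spec location
}).then((results) => {
  if (results.status === 'failed') {
    // Cypress could not run at all (misconfiguration, missing binary, ...)
    console.error('Cypress failed to run:', results.message);
    process.exit(1);
  }
  console.log(`Run finished: ${results.totalPassed} passed, ${results.totalFailed} failed`);
});
```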
```diff
@@ -1,31 +1,56 @@
-String.prototype.replaceLast = function (what, replacement) {
-    var pcs = this.split(what);
-    var lastPc = pcs.pop();
-    return pcs.join(what) + replacement + lastPc;
-}
-
 async function getContent(url) {
     const res = await fetch(url.concat('/download'));
     return res.text();
 }
 
-async function migrateDocument(url, baseUrl) {
-    const content = await getContent(url);
-    cy.request({
-        url: baseUrl.concat('/new'),
-        method: 'POST',
-        headers: {
-            'Content-Type': 'text/markdown',
-            'Access-Control-Allow-Origin': new URL(baseUrl).hostname,
-        },
-        body: content,
-    }).then((res) => {
-        const redirect = res.redirects[0].split(' ')[1];
-        cy.visit(url);
-        cy.get('#view-mode-toggle-edit').click({force: true});
-        cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
-        cy.get('.CodeMirror-scroll').type(`Moved to [${redirect}](${redirect})`);
-    });
-}
+async function migrateDocument(oldUrl, newUrl) {
+    // get content of old pad
+    var content = await getContent(oldUrl);
+
+    // replace URLs
+    content = content.replaceAll(new URL(oldUrl).hostname, new URL(newUrl).hostname);
+
+    // visit the new pad url (not possible via a POST API request because the pad may already exist)
+    // Caution: content of the new pad url will be overwritten!
+    // Caution: history will not be moved to the new pad url
+    cy.visit(newUrl.concat('?edit'));
+    cy.window().then((win) => {
+        // write content
+        cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
+        cy.get('.CodeMirror-scroll').type(content);
+
+        // visit the old pad and replace its content with a pointer to the new pad
+        cy.visit(oldUrl);
+        cy.get('#view-mode-toggle-edit').click({force: true});
+        cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
+        cy.get('.CodeMirror-scroll').type(`# 301 Pad moved{enter}=> [${newUrl}](${newUrl})`);
+    })
+}
 
 describe('Migrate document', () => {
     it('passes', async () => {
-        const baseUrl = 'https://md.margau.net';
-        const url = 'https://md.margau.net/H0JO3L5DS-6Yhv4RrdS-tw';
-        migrateDocument(url, baseUrl);
-    });
-});
+        // read the list of pads to migrate
+        cy.fixture('pads').then(async(pads) => {
+            if(pads.length === 0) {
+                console.log("Didn't find any pad urls to migrate");
+            }
+
+            for (const pad of pads) {
+                await migrateDocument(pad.oldUrl, pad.newUrl);
+            }
+        })
+    })
+})
```
cypress/fixtures/pads.json, 10 changes (new file)

```diff
@@ -0,0 +1,10 @@
+[
+    {
+        "oldUrl":"https://md.margau.net/test",
+        "newUrl":"https://pad.hacknang.de/test"
+    },
+    {
+        "oldUrl":"https://md.margau.net/test3",
+        "newUrl":"https://pad.hacknang.de/test2"
+    }
+]
```
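The fixture's `oldUrl`/`newUrl` schema matches what the deleted extractor script (below) wrote to `pads.json` via `to_dict`, so the extractor's output can be dropped into `cypress/fixtures/` for the spec to consume with `cy.fixture('pads')`. A small sketch of that step, assuming the extractor's `pads.json` sits in the repository root (the helper is hypothetical, not part of this change):

```js
// copy-pads.js -- hypothetical helper: move the extractor output into the
// fixtures directory, checking the schema the spec expects on the way.
const fs = require('fs');

const pads = JSON.parse(fs.readFileSync('pads.json', 'utf8'));
for (const pad of pads) {
  if (!pad.oldUrl || !pad.newUrl) {
    throw new Error(`entry missing oldUrl/newUrl: ${JSON.stringify(pad)}`);
  }
}
fs.writeFileSync('cypress/fixtures/pads.json', JSON.stringify(pads, null, 4));
console.log(`Wrote ${pads.length} pads to cypress/fixtures/pads.json`);
```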
```diff
@@ -1,147 +0,0 @@
-import requests
-import json
-import re
-import argparse
-import os
-
-from urllib.parse import urlparse
-from typing import List
-from os.path import exists
-
-IMG_PATH = "images/"
-PAD_PATH = "pads/"
-NEW_NETLOC = ""
-
-seen_pads = set()
-pads_json = []
-
-os.makedirs(PAD_PATH, exist_ok=True)
-os.makedirs(IMG_PATH, exist_ok=True)
-os.makedirs(IMG_PATH + "uploads", exist_ok=True)
-
-
-class Image:
-    def __init__(self, _old_url: str) -> None:
-        self.old_url = _old_url
-        old = urlparse(self.old_url)
-        self.new_url = "." + old.path
-
-    def download(self):
-        r = requests.get(self.old_url)
-        p = IMG_PATH + urlparse(self.old_url).path
-        with open(p, 'wb') as f:
-            f.write(r.content)
-        print(f"Downloaded image {self.old_url} to {p}")
-
-
-class PadPage:
-    def __init__(self, _old_url: str):
-        self.old_url: str = _old_url
-        #self.name: str = _name
-        self.images: List = []
-        self.linked_pads = []
-        self.content: str = ""
-
-    def gather_linked_pads(self):
-        regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+'
-        matches = re.findall(regex, self.content)
-
-        full_url = urlparse(self.old_url)
-        for match in matches:
-            print(f"match: {match}")
-            url = urlparse(match)
-            if url.netloc == full_url.netloc:
-                self.linked_pads.append(PadPage(match))
-            else:
-                print("Dropped pad, wrong netloc")
-
-    def to_dict(self) -> dict:
-        old = urlparse(self.old_url)
-        new = old._replace(netloc="pad.hacknang.de")
-        return {"oldUrl": self.old_url, "newUrl": new.geturl()}
-
-    def to_json(self) -> str:
-        old = urlparse(self.old_url)
-        new = old._replace(netloc=NEW_NETLOC)
-        return json.dumps({"oldUrl": self.old_url, "newUrl": new.geturl()})
-
-    def download_and_process_content(self):
-        print(f"Downloading pad at {self.old_url}")
-        seen_pads.add(self.old_url)
-        r = requests.get(self.old_url + "/download")
-        if r.status_code == 200:
-            self.content = r.text
-            with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f:
-                f.write(self.content)
-            num = self._find_images()
-            print(f"Found {num} images")
-        else:
-            print(f"Error downloading Pad {self.old_url}, got HTTP status code {r.status_code}")
-
-    # returns number of images found
-    def _find_images(self) -> int:
-        regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)'
-        matches = re.findall(regex, self.content)
-
-        full_url = urlparse(self.old_url)
-        for match in matches:
-            print(f"match: {match}")
-            url = urlparse(match)
-            if url.netloc == full_url.netloc:
-                i = Image(match)
-                self.images.append(i)
-            else:
-                print("Dropped pad, wrong netloc")
-        return len(matches)
-
-    def download_images(self):
-        for i in self.images:
-            i.download()
-
-    def recursive(self):
-        for pad in self.linked_pads:
-            if pad.old_url not in seen_pads:
-                print(f"New pad found: {pad.old_url}")
-                pad.download_and_process_content()
-                pad.download_images()
-                pad.gather_linked_pads()
-                pads_json.append(pad.to_dict())
-                pad.recursive()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
-    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
-                        help="urls of the pad to start searching")
-    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
-                        help="url of the new pad for generating the json")
-
-    args = parser.parse_args()
-
-    old_file = []
-
-    if exists("pads.json"):
-        with open("pads.json", "r") as f:
-            old_file = json.load(f)
-
-        for entry in old_file:
-            seen_pads.add(entry["old_url"])
-
-        pads_json.extend(old_file)
-        print(f"Seen Pads: {seen_pads}")
-
-    NEW_NETLOC = args.new_pad_url[0]
-    for pad_url in args.pad_url:
-        pad = PadPage(pad_url)
-        pad.download_and_process_content()
-        pad.download_images()
-        pad.gather_linked_pads()
-        pads_json.append(pad.to_dict())
-        pad.recursive()
-
-    print(f"We have seen {len(seen_pads)} which were {seen_pads}")
-
-    with open("pads.json", "w") as f:
-        f.write(json.dumps(pads_json))
```
```diff
@@ -1,27 +0,0 @@
-import requests
-import json
-import os
-
-from os.path import exists
-
-pads_json = []
-new_json= []
-
-if exists("pads.json"):
-    with open("pads.json", "r") as f:
-        old_file = json.load(f)
-
-    pads_json.extend(old_file)
-
-    for pad in pads_json:
-        print("Downloading: " + pad['oldUrl'] + "/download")
-        r = requests.head(pad['oldUrl'] + "/download")
-        if int(r.headers['content-length']) > 150:
-            new_json.append(pad)
-
-    with open("new_pads.json", "w") as f:
-        f.write(json.dumps(new_json))
-
-else:
-    print("Give me pads.json")
```
```diff
@@ -1 +0,0 @@
-# Java
```
```diff
@@ -1,8 +0,0 @@
-beautifulsoup4==4.11.1
-bs4==0.0.1
-certifi==2022.6.15.1
-charset-normalizer==2.1.1
-idna==3.3
-requests==2.28.1
-soupsieve==2.3.2.post1
-urllib3==1.26.12
```