diff --git a/.gitignore b/.gitignore
index 61a7128..ceaea36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,3 @@
-# Python
-__pycache__
-venv/
-images/
-pads/
-pads.json
-new_pads.json
 # ---> Node
 # Logs
 logs
diff --git a/README.md b/README.md
index 64ccbb3..ef10214 100644
--- a/README.md
+++ b/README.md
@@ -2,49 +2,12 @@
 
 Simple script to migrate content from one Hedgedoc instance to another.
 
-## Cypress
-
+## Setup
 Follow the guide over [here](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites). You also require NodeJS.
 
 Run `npm ci` to install the required packages.
 
-### Execution
+## Execution
 
 To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron.
 This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api).
-
-
-## Python
-
-Create new python venv:
-```
-python -m venv venv
-```
-
-Activate the new environment (linux)
-
-```
-source venv/bin/activate
-```
-
-Install the requirements
-```
-pip install -r requirements.txt
-```
-
-### Execution
-
-Start the extractor
-```
-python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ...
-```
-
-For example:
-```
-python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta
-```
-
-### Produced files
-
-The python scripts produces a `pads.json` which contains the mapping from `old_url` to `new_url`.
-All images land in `images/uploads`. Only images hosted on the `old_pads` URL are saved
diff --git a/cypress/e2e/hedgedoc-migrator.cy.js b/cypress/e2e/hedgedoc-migrator.cy.js
index 82c5430..6c671ff 100644
--- a/cypress/e2e/hedgedoc-migrator.cy.js
+++ b/cypress/e2e/hedgedoc-migrator.cy.js
@@ -1,31 +1,56 @@
+String.prototype.replaceLast = function (what, replacement) {
+  var pcs = this.split(what);
+  var lastPc = pcs.pop();
+  return pcs.join(what) + replacement + lastPc;
+}
+
 async function getContent(url) {
   const res = await fetch(url.concat('/download'));
   return res.text();
 }
 
-async function migrateDocument(url, baseUrl) {
-  const content = await getContent(url);
-  cy.request({
-    url: baseUrl.concat('/new'),
-    method: 'POST',
-    headers: {
-      'Content-Type': 'text/markdown',
-      'Access-Control-Allow-Origin': new URL(baseUrl).hostname,
-    },
-    body: content,
-  }).then((res) => {
-    const redirect = res.redirects[0].split(' ')[1];
-    cy.visit(url);
-    cy.get('#view-mode-toggle-edit').click({force: true});
-    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
-    cy.get('.CodeMirror-scroll').type(`Moved to [${redirect}](${redirect})`);
-  });
+async function migrateDocument(oldUrl, newUrl) {
+  // get content of old pad
+  var content = await getContent(oldUrl);
+
+  // replace URLs
+  content = content.replaceAll(new URL(oldUrl).hostname, new URL(newUrl).hostname);
+
+  // visit new pad url (not possible via post api request because pad may already exist)
+  // Caution: Content of new pad url will be overwritten!
+  // Caution: History will not be moved to new pad url
+  cy.visit(newUrl.concat('?edit'));
+  cy.window().then((win) => {
+
+    // Write Content
+    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
+    cy.get('.CodeMirror-scroll').type(content);
+
+    // Visit old pad and replace content
+    cy.visit(oldUrl);
+    cy.get('#view-mode-toggle-edit').click({force: true});
+    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
+    cy.get('.CodeMirror-scroll').type(`# 301 Pad moved{enter}=> [${newUrl}](${newUrl})`);
+
+  })
+
 }
 
 describe('Migrate document', () => {
-  it('passes', async () => {
-    const baseUrl = 'https://md.margau.net';
-    const url = 'https://md.margau.net/H0JO3L5DS-6Yhv4RrdS-tw';
-    migrateDocument(url, baseUrl);
-  });
-});
+  it('passes', async () => {
+
+    // Read list of pads to migrate
+    cy.fixture('pads').then(async(pads) => {
+
+      if(pads.length === 0) {
+        console.log("Didn't find any pad urls to migrate");
+      }
+
+      for (const pad of pads) {
+        await migrateDocument(pad.oldUrl, pad.newUrl);
+      }
+
+    })
+
+  })
+})
diff --git a/cypress/fixtures/pads.json b/cypress/fixtures/pads.json
new file mode 100644
index 0000000..98ca924
--- /dev/null
+++ b/cypress/fixtures/pads.json
@@ -0,0 +1,10 @@
+[
+  {
+    "oldUrl":"https://md.margau.net/test",
+    "newUrl":"https://pad.hacknang.de/test"
+  },
+  {
+    "oldUrl":"https://md.margau.net/test3",
+    "newUrl":"https://pad.hacknang.de/test2"
+  }
+]
diff --git a/hedgedoc-image.py b/hedgedoc-image.py
deleted file mode 100644
index d1ab53c..0000000
--- a/hedgedoc-image.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import requests
-import json
-import re
-import argparse
-import os
-
-from urllib.parse import urlparse
-from typing import List
-from os.path import exists
-
-IMG_PATH = "images/"
-PAD_PATH = "pads/"
-NEW_NETLOC = ""
-
-seen_pads = set()
-
-pads_json = []
-
-os.makedirs(PAD_PATH, exist_ok=True)
-os.makedirs(IMG_PATH, exist_ok=True)
-os.makedirs(IMG_PATH + "uploads", exist_ok=True)
-
-
-class Image:
-    def __init__(self, _old_url: str) -> None:
-        self.old_url = _old_url
-        old = urlparse(self.old_url)
-        self.new_url = "." + old.path;
-
-    def download(self):
-        r = requests.get(self.old_url)
-        p = IMG_PATH + urlparse(self.old_url).path
-        with open(p, 'wb') as f:
-            f.write(r.content)
-        print(f"Downloaded image {self.old_url} to {p}")
-
-
-class PadPage:
-    def __init__(self, _old_url: str):
-        self.old_url: str = _old_url
-        #self.name: str = _name
-        self.images: List = []
-        self.linked_pads = []
-        self.content: str = ""
-
-    def gather_linked_pads(self):
-        regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+'
-        matches = re.findall(regex, self.content)
-
-        full_url = urlparse(self.old_url)
-        for match in matches:
-            print(f"match: {match}")
-            url = urlparse(match)
-            if url.netloc == full_url.netloc:
-                self.linked_pads.append(PadPage(match))
-            else:
-                print("Dropped pad, wrong netloc")
-
-    def to_dict(self) -> dict:
-        old = urlparse(self.old_url)
-        new = old._replace(netloc="pad.hacknang.de")
-        return {"oldUrl": self.old_url, "newUrl": new.geturl()}
-
-    def to_json(self) -> str:
-        old = urlparse(self.old_url)
-        new = old._replace(netloc=NEW_NETLOC)
-        return json.dumps({"oldUrl": self.old_url, "newUrl": new.geturl()})
-
-    def download_and_process_content(self):
-        print(f"Downloading pad at {self.old_url}")
-        seen_pads.add(self.old_url)
-        r = requests.get(self.old_url + "/download")
-        if r.status_code == 200:
-            self.content = r.text
-            with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f:
-                f.write(self.content)
-            num = self._find_images()
-            print(f"Found {num} images")
-        else:
-            print(f"Error downloading Pad {self.old_url}, got HTTP status code {r.status_code}")
-
-    # returns number of images found
-    def _find_images(self) -> int:
-        regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)'
-        matches = re.findall(regex, self.content)
-
-        full_url = urlparse(self.old_url)
-        for match in matches:
-            print(f"match: {match}")
-            url = urlparse(match)
-            if url.netloc == full_url.netloc:
-                i = Image(match)
-                self.images.append(i)
-            else:
-                print("Dropped pad, wrong netloc")
-        return len(matches)
-
-    def download_images(self):
-        for i in self.images:
-            i.download()
-
-    def recursive(self):
-        for pad in self.linked_pads:
-            if pad.old_url not in seen_pads:
-                print(f"New pad found: {pad.old_url}")
-                pad.download_and_process_content()
-                pad.download_images()
-                pad.gather_linked_pads()
-                pads_json.append(pad.to_dict())
-                pad.recursive()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad")
-    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
-                        help="urls of the pad to start searching")
-    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
-                        help="url of the new pad for generating the json")
-
-    args = parser.parse_args()
-
-    old_file = []
-
-    if exists("pads.json"):
-        with open("pads.json", "r") as f:
-            old_file = json.load(f)
-
-    for entry in old_file:
-        seen_pads.add(entry["old_url"])
-
-    pads_json.extend(old_file)
-    print(f"Seen Pads: {seen_pads}")
-
-
-    NEW_NETLOC = args.new_pad_url[0]
-    for pad_url in args.pad_url:
-        pad = PadPage(pad_url)
-        pad.download_and_process_content()
-        pad.download_images()
-        pad.gather_linked_pads()
-        pads_json.append(pad.to_dict())
-        pad.recursive()
-
-    print(f"We have seen {len(seen_pads)} which were {seen_pads}")
-
-    with open("pads.json", "w") as f:
-        f.write(json.dumps(pads_json))
diff --git a/hedgedoc-is-moved.py b/hedgedoc-is-moved.py
deleted file mode 100644
index eb2057e..0000000
--- a/hedgedoc-is-moved.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import requests
-import json
-import os
-
-from os.path import exists
-
-pads_json = []
-new_json= []
-
-if exists("pads.json"):
-    with open("pads.json", "r") as f:
-        old_file = json.load(f)
-
-    pads_json.extend(old_file)
-
-    for pad in pads_json:
-        print("Downloading: " + pad['oldUrl'] + "/download")
-        r = requests.head(pad['oldUrl'] + "/download")
-        if int(r.headers['content-length']) > 150:
-            new_json.append(pad)
-
-
-    with open("new_pads.json", "w") as f:
-        f.write(json.dumps(new_json))
-
-else:
-    print("Give me pads.json")
diff --git a/pads/19IN-2Sem-Java b/pads/19IN-2Sem-Java
deleted file mode 100644
index 526e9b1..0000000
--- a/pads/19IN-2Sem-Java
+++ /dev/null
@@ -1 +0,0 @@
-# Java
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index a20d73b..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-beautifulsoup4==4.11.1
-bs4==0.0.1
-certifi==2022.6.15.1
-charset-normalizer==2.1.1
-idna==3.3
-requests==2.28.1
-soupsieve==2.3.2.post1
-urllib3==1.26.12
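The README kept by this diff notes that the manual `npx cypress open` step could be automated using the Cypress [module API](https://docs.cypress.io/guides/guides/module-api). A minimal sketch of such a runner, assuming the spec path `cypress/e2e/hedgedoc-migrator.cy.js` added in this diff; the file name `run-migration.js` is hypothetical and not part of the change:

```
// run-migration.js (hypothetical helper, not part of this diff)
// Runs the migrator spec headlessly via the Cypress module API
// instead of the interactive `npx cypress open` flow.
const cypress = require('cypress');

cypress
  .run({
    spec: 'cypress/e2e/hedgedoc-migrator.cy.js', // spec added in this diff
    browser: 'electron',                         // README runs the tests with Electron
  })
  .then((results) => {
    // totalFailed is part of the module API result object for completed runs;
    // a run that failed to start has no such field, so treat that as a failure too.
    const failed = results.totalFailed ?? 1;
    process.exit(failed > 0 ? 1 : 0);
  });
```

After `npm ci`, this could be invoked with `node run-migration.js`.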