From 2b30b277e70d471890a9015867b3df8ec0b41a55 Mon Sep 17 00:00:00 2001 From: Patrick Schwarz Date: Tue, 13 Sep 2022 21:25:02 +0200 Subject: [PATCH 1/9] Unsuccessfully try to change cypress test to migrate a bunch of pads using json file --- cypress/e2e/hedgedoc-migrator.cy.js | 71 +++++++++++++++++++---------- cypress/fixtures/pads.json | 10 ++++ 2 files changed, 58 insertions(+), 23 deletions(-) create mode 100644 cypress/fixtures/pads.json diff --git a/cypress/e2e/hedgedoc-migrator.cy.js b/cypress/e2e/hedgedoc-migrator.cy.js index 82c5430..faa1003 100644 --- a/cypress/e2e/hedgedoc-migrator.cy.js +++ b/cypress/e2e/hedgedoc-migrator.cy.js @@ -1,31 +1,56 @@ +String.prototype.replaceLast = function (what, replacement) { + var pcs = this.split(what); + var lastPc = pcs.pop(); + return pcs.join(what) + replacement + lastPc; +} + async function getContent(url) { const res = await fetch(url.concat('/download')); return res.text(); } -async function migrateDocument(url, baseUrl) { - const content = await getContent(url); - cy.request({ - url: baseUrl.concat('/new'), - method: 'POST', - headers: { - 'Content-Type': 'text/markdown', - 'Access-Control-Allow-Origin': new URL(baseUrl).hostname, - }, - body: content, - }).then((res) => { - const redirect = res.redirects[0].split(' ')[1]; - cy.visit(url); - cy.get('#view-mode-toggle-edit').click({force: true}); - cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}'); - cy.get('.CodeMirror-scroll').type(`Moved to [${redirect}](${redirect})`); - }); +async function migrateDocument(oldUrl, newUrl) { + // get content of old pad + var content = await getContent(oldUrl); + + // replace URLs + content = content.replaceAll(new URL(oldUrl).hostname, new URL(newUrl).hostname); + + // visit new pad url (not possible via post api request because pad may already exists) + // Caution: Content of new pad url will be overwritten! + // Caution: History will not be moved to new pad url + cy.visit(newUrl.concat('?edit')); + cy.window().then((win) => { + + // Write Content + cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}'); + cy.get('.CodeMirror-scroll').type(content); + + // Visit old pad and replace content + cy.visit(oldUrl); + cy.get('#view-mode-toggle-edit').click({force: true}); + cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}'); + cy.get('.CodeMirror-scroll').type(`# 301 Pad moved{enter}=> [${newUrl}](${newUrl})`); + + }) + } describe('Migrate document', () => { - it('passes', async () => { - const baseUrl = 'https://md.margau.net'; - const url = 'https://md.margau.net/H0JO3L5DS-6Yhv4RrdS-tw'; - migrateDocument(url, baseUrl); - }); -}); + it('passes', async () => { + + // Read list of pads to migrate + cy.fixture('pads').then( pads => { + + if(pads.length === 0) { + console.log("Didn't find any pad urls to migrate"); + } + + for (const pad of pads) { + migrateDocument(pad.oldUrl, pad.newUrl); + } + + }) + + }) +}) diff --git a/cypress/fixtures/pads.json b/cypress/fixtures/pads.json new file mode 100644 index 0000000..98ca924 --- /dev/null +++ b/cypress/fixtures/pads.json @@ -0,0 +1,10 @@ +[ + { + "oldUrl":"https://md.margau.net/test", + "newUrl":"https://pad.hacknang.de/test" + }, + { + "oldUrl":"https://md.margau.net/test3", + "newUrl":"https://pad.hacknang.de/test2" + } +] From d10cf37bfd61b7b84013ab8e54ca52b74498339e Mon Sep 17 00:00:00 2001 From: Patrick Schwarz Date: Wed, 14 Sep 2022 00:14:31 +0200 Subject: [PATCH 2/9] Fix non working non working migration script with cypress async magic --- cypress/e2e/hedgedoc-migrator.cy.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cypress/e2e/hedgedoc-migrator.cy.js b/cypress/e2e/hedgedoc-migrator.cy.js index faa1003..6c671ff 100644 --- a/cypress/e2e/hedgedoc-migrator.cy.js +++ b/cypress/e2e/hedgedoc-migrator.cy.js @@ -40,14 +40,14 @@ describe('Migrate document', () => { it('passes', async () => { // Read list of pads to migrate - cy.fixture('pads').then( pads => { + cy.fixture('pads').then(async(pads) => { if(pads.length === 0) { console.log("Didn't find any pad urls to migrate"); } for (const pad of pads) { - migrateDocument(pad.oldUrl, pad.newUrl); + await migrateDocument(pad.oldUrl, pad.newUrl); } }) From 7a422ef89bb09df9b00466ce8586376632e17bce Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Wed, 14 Sep 2022 14:04:35 +0200 Subject: [PATCH 3/9] Add python extractor --- .gitignore | 5 ++ README.md | 31 +++++++++++ hedgedoc-image.py | 130 ++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 8 +++ 4 files changed, 174 insertions(+) create mode 100644 hedgedoc-image.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore index ceaea36..fde6bf5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# Python +__pycache__ +venv/ +images/ +pads.json # ---> Node # Logs logs diff --git a/README.md b/README.md index ef10214..319aef6 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,37 @@ Follow the guide over [here](https://docs.cypress.io/guides/getting-started/inst You also require NodeJS. Run `npm ci` to install the required packages. +Create new python venv: +``` +python -m venv venv +``` + +Activate the new environment (linux) + +``` +source venv/bin/activate +``` + +Install the requirements +``` +pip -r requirements.txt +``` + ## Execution To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api). + +Start the extractor +``` +python hedgedoc-image.py meta_pad new_netloc +``` + +For example: +``` +python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de +``` + +## Produced files + +The python scripts produces a `pads.json` which contains the mapping from `old_url` to `new_url`. +All images land in `images/uploads`. Only images hosted on the `old_pads` URL are saved diff --git a/hedgedoc-image.py b/hedgedoc-image.py new file mode 100644 index 0000000..184b275 --- /dev/null +++ b/hedgedoc-image.py @@ -0,0 +1,130 @@ +import requests +import json +import re +import argparse + +import os + +from urllib.parse import urlparse +from typing import List + +PATH = "images/" +NEW_NETLOC = "" + +seen_pads = [] + +pads_json = [] + +os.makedirs(PATH, exist_ok=True) +os.makedirs(PATH + "uploads", exist_ok=True) +# TODO: Loop detection +# TODO: Recursion +class Image: + def __init__(self, _old_url: str) -> None: + self.old_url = _old_url + old = urlparse(self.old_url) + self.new_url = "." + old.path; + + def download(self): + print("HERE") + r = requests.get(self.old_url) + p = PATH + urlparse(self.old_url).path + with open(p, 'wb') as f: + f.write(r.content) + print(f"Downloaded image {self.old_url} to {p}") + + +class PadPage: + def __init__(self, _old_url: str): + self.old_url: str = _old_url + #self.name: str = _name + self.images: List = [] + self.linked_pads = [] + self.content: str = "" + + def gather_linked_pads(self): + regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+' + matches = re.findall(regex, self.content) + + full_url = urlparse(self.old_url) + for match in matches: + print(f"match: {match}") + url = urlparse(match) + if url.netloc == full_url.netloc: + self.linked_pads.append(PadPage(match)) + else: + print("Dropped pad, wrong netloc") + + def to_dict(self) -> dict: + old = urlparse(self.old_url) + new = old._replace(netloc="pad.hacknang.de") + return {"old_url": self.old_url, "new_url": new.geturl()} + + def to_json(self) -> str: + old = urlparse(self.old_url) + new = old._replace(netloc=NEW_NETLOC) + return json.dumps({"old_url": self.old_url, "new_url": new.geturl()}) + + def download_and_process_content(self): + print(f"Downloading pad at {self.old_url}") + seen_pads.append(self.old_url) + r = requests.get(self.old_url + "/download") + if r.status_code == 200: + self.content = r.text + num = self._find_images() + print(f"Found {num} images") + else: + print(f"Error downloading Pad {self.old_url}, got HTTP status code {r.status_code}") + + # returns number of images found + def _find_images(self) -> int: + regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)' + matches = re.findall(regex, self.content) + + full_url = urlparse(self.old_url) + for match in matches: + print(f"match: {match}") + url = urlparse(match) + if url.netloc == full_url.netloc: + i = Image(match) + self.images.append(i) + else: + print("Dropped pad, wrong netloc") + return len(matches) + + def download_images(self): + for i in self.images: + i.download() + + def recursive(self): + for pad in self.linked_pads: + if pad.old_url not in seen_pads: + print(f"New pad found: {pad.old_url}") + pad.download_and_process_content() + pad.download_images() + pad.gather_linked_pads() + pads_json.append(pad.to_dict()) + pad.recursive() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad") + parser.add_argument("pad_url", metavar="N", type=str, nargs=1, + help="url of the pad to start searching") + parser.add_argument("new_pad_url", metavar="N", type=str, nargs=1, + help="url of the new pad for generating the json") + + args = parser.parse_args() + + NEW_NETLOC = args.new_pad_url[0] + pad = PadPage(args.pad_url[0]) + pad.download_and_process_content() + pad.download_images() + pad.gather_linked_pads() + pads_json.append(pad.to_dict()) + pad.recursive() + + print(f"We have seen {len(seen_pads)} which were {seen_pads}") + + with open("pads.json", "w") as f: + f.write(json.dumps(pads_json)) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a20d73b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +beautifulsoup4==4.11.1 +bs4==0.0.1 +certifi==2022.6.15.1 +charset-normalizer==2.1.1 +idna==3.3 +requests==2.28.1 +soupsieve==2.3.2.post1 +urllib3==1.26.12 From 425b13d7a4df033d00469c77fd21b91cc955bf45 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Thu, 15 Sep 2022 18:34:42 +0200 Subject: [PATCH 4/9] Preserve old runs in pads.json --- hedgedoc-image.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/hedgedoc-image.py b/hedgedoc-image.py index 184b275..dcb5d90 100644 --- a/hedgedoc-image.py +++ b/hedgedoc-image.py @@ -7,6 +7,7 @@ import os from urllib.parse import urlparse from typing import List +from os.path import exists PATH = "images/" NEW_NETLOC = "" @@ -17,8 +18,8 @@ pads_json = [] os.makedirs(PATH, exist_ok=True) os.makedirs(PATH + "uploads", exist_ok=True) -# TODO: Loop detection -# TODO: Recursion + + class Image: def __init__(self, _old_url: str) -> None: self.old_url = _old_url @@ -26,7 +27,6 @@ class Image: self.new_url = "." + old.path; def download(self): - print("HERE") r = requests.get(self.old_url) p = PATH + urlparse(self.old_url).path with open(p, 'wb') as f: @@ -116,6 +116,19 @@ if __name__ == "__main__": args = parser.parse_args() + old_file = [] + + if exists("pads.json"): + with open("pads.json", "r") as f: + old_file = json.load(f) + + for entry in old_file: + seen_pads.append(entry["old_url"]) + + pads_json.extend(old_file) + print(f"Seen Pads: {seen_pads}") + + NEW_NETLOC = args.new_pad_url[0] pad = PadPage(args.pad_url[0]) pad.download_and_process_content() From 226a9c4a38b0a1b1b07e47067429ef08fd26316e Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Thu, 15 Sep 2022 18:36:16 +0200 Subject: [PATCH 5/9] Cleaned up readme --- README.md | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 319aef6..7215df9 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,20 @@ Simple script to migrate content from one Hedgedoc instance to another. -## Setup +## Cypress + Follow the guide over [here](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites). You also require NodeJS. Run `npm ci` to install the required packages. +### Execution + +To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api). + + +## Python + Create new python venv: ``` python -m venv venv @@ -21,12 +29,10 @@ source venv/bin/activate Install the requirements ``` -pip -r requirements.txt +pip install -r requirements.txt ``` -## Execution - -To start cypress, simply execute `npx cypress open`. Then click `E2E Testing` and run using electron. This step could be automated using the `cypress` [API](https://docs.cypress.io/guides/guides/module-api). +### Execution Start the extractor ``` @@ -38,7 +44,7 @@ For example: python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de ``` -## Produced files +### Produced files The python scripts produces a `pads.json` which contains the mapping from `old_url` to `new_url`. All images land in `images/uploads`. Only images hosted on the `old_pads` URL are saved From 7c770166a75b2930292e0c80ac466c21299ac7d3 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Thu, 15 Sep 2022 19:31:00 +0200 Subject: [PATCH 6/9] Add ability to specify multiple metapads --- README.md | 4 ++-- hedgedoc-image.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 7215df9..64ccbb3 100644 --- a/README.md +++ b/README.md @@ -36,12 +36,12 @@ pip install -r requirements.txt Start the extractor ``` -python hedgedoc-image.py meta_pad new_netloc +python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ... ``` For example: ``` -python hedgedoc-image.py https://md.margau.net/dbk-meta pad.hacknang.de +python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta ``` ### Produced files diff --git a/hedgedoc-image.py b/hedgedoc-image.py index dcb5d90..753fd44 100644 --- a/hedgedoc-image.py +++ b/hedgedoc-image.py @@ -2,7 +2,6 @@ import requests import json import re import argparse - import os from urllib.parse import urlparse @@ -12,7 +11,7 @@ from os.path import exists PATH = "images/" NEW_NETLOC = "" -seen_pads = [] +seen_pads = set() pads_json = [] @@ -67,7 +66,7 @@ class PadPage: def download_and_process_content(self): print(f"Downloading pad at {self.old_url}") - seen_pads.append(self.old_url) + seen_pads.add(self.old_url) r = requests.get(self.old_url + "/download") if r.status_code == 200: self.content = r.text @@ -109,9 +108,9 @@ class PadPage: if __name__ == "__main__": parser = argparse.ArgumentParser(description="Recursivly downloads all images from a hedgedoc pad") - parser.add_argument("pad_url", metavar="N", type=str, nargs=1, - help="url of the pad to start searching") - parser.add_argument("new_pad_url", metavar="N", type=str, nargs=1, + parser.add_argument("pad_url", metavar="N", type=str, nargs="+", + help="urls of the pad to start searching") + parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1, help="url of the new pad for generating the json") args = parser.parse_args() @@ -123,19 +122,20 @@ if __name__ == "__main__": old_file = json.load(f) for entry in old_file: - seen_pads.append(entry["old_url"]) + seen_pads.add(entry["old_url"]) pads_json.extend(old_file) print(f"Seen Pads: {seen_pads}") NEW_NETLOC = args.new_pad_url[0] - pad = PadPage(args.pad_url[0]) - pad.download_and_process_content() - pad.download_images() - pad.gather_linked_pads() - pads_json.append(pad.to_dict()) - pad.recursive() + for pad_url in args.pad_url: + pad = PadPage(pad_url) + pad.download_and_process_content() + pad.download_images() + pad.gather_linked_pads() + pads_json.append(pad.to_dict()) + pad.recursive() print(f"We have seen {len(seen_pads)} which were {seen_pads}") From 11b5541c0f9feac4a3f7e808a4afdeee3b10c7e5 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Fri, 16 Sep 2022 23:06:07 +0200 Subject: [PATCH 7/9] Also save pad content on crawl --- hedgedoc-image.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hedgedoc-image.py b/hedgedoc-image.py index 753fd44..05dd570 100644 --- a/hedgedoc-image.py +++ b/hedgedoc-image.py @@ -8,15 +8,17 @@ from urllib.parse import urlparse from typing import List from os.path import exists -PATH = "images/" +IMG_PATH = "images/" +PAD_PATH = "pads/" NEW_NETLOC = "" seen_pads = set() pads_json = [] -os.makedirs(PATH, exist_ok=True) -os.makedirs(PATH + "uploads", exist_ok=True) +os.makedirs(PAD_PATH, exist_ok=True) +os.makedirs(IMG_PATH, exist_ok=True) +os.makedirs(IMG_PATH + "uploads", exist_ok=True) class Image: @@ -27,7 +29,7 @@ class Image: def download(self): r = requests.get(self.old_url) - p = PATH + urlparse(self.old_url).path + p = IMG_PATH + urlparse(self.old_url).path with open(p, 'wb') as f: f.write(r.content) print(f"Downloaded image {self.old_url} to {p}") @@ -70,6 +72,8 @@ class PadPage: r = requests.get(self.old_url + "/download") if r.status_code == 200: self.content = r.text + with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f: + f.write(self.content) num = self._find_images() print(f"Found {num} images") else: From dea3fa9fe65f64870399af0cb0708d5b7902e2a8 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Fri, 16 Sep 2022 23:24:28 +0200 Subject: [PATCH 8/9] New json format (snake_case to camelCase) --- hedgedoc-image.py | 4 ++-- pads/19IN-2Sem-Java | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) create mode 100644 pads/19IN-2Sem-Java diff --git a/hedgedoc-image.py b/hedgedoc-image.py index 05dd570..d1ab53c 100644 --- a/hedgedoc-image.py +++ b/hedgedoc-image.py @@ -59,12 +59,12 @@ class PadPage: def to_dict(self) -> dict: old = urlparse(self.old_url) new = old._replace(netloc="pad.hacknang.de") - return {"old_url": self.old_url, "new_url": new.geturl()} + return {"oldUrl": self.old_url, "newUrl": new.geturl()} def to_json(self) -> str: old = urlparse(self.old_url) new = old._replace(netloc=NEW_NETLOC) - return json.dumps({"old_url": self.old_url, "new_url": new.geturl()}) + return json.dumps({"oldUrl": self.old_url, "newUrl": new.geturl()}) def download_and_process_content(self): print(f"Downloading pad at {self.old_url}") diff --git a/pads/19IN-2Sem-Java b/pads/19IN-2Sem-Java new file mode 100644 index 0000000..526e9b1 --- /dev/null +++ b/pads/19IN-2Sem-Java @@ -0,0 +1 @@ +# Java \ No newline at end of file From 6f7c991948526ece8ff474da1092563bc4092558 Mon Sep 17 00:00:00 2001 From: Nick Hahn Date: Sat, 17 Sep 2022 11:40:16 +0200 Subject: [PATCH 9/9] Add script that checks if pad is already moved --- .gitignore | 2 ++ hedgedoc-is-moved.py | 27 +++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 hedgedoc-is-moved.py diff --git a/.gitignore b/.gitignore index fde6bf5..61a7128 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,9 @@ __pycache__ venv/ images/ +pads/ pads.json +new_pads.json # ---> Node # Logs logs diff --git a/hedgedoc-is-moved.py b/hedgedoc-is-moved.py new file mode 100644 index 0000000..eb2057e --- /dev/null +++ b/hedgedoc-is-moved.py @@ -0,0 +1,27 @@ +import requests +import json +import os + +from os.path import exists + +pads_json = [] +new_json= [] + +if exists("pads.json"): + with open("pads.json", "r") as f: + old_file = json.load(f) + + pads_json.extend(old_file) + + for pad in pads_json: + print("Downloading: " + pad['oldUrl'] + "/download") + r = requests.head(pad['oldUrl'] + "/download") + if int(r.headers['content-length']) > 150: + new_json.append(pad) + + + with open("new_pads.json", "w") as f: + f.write(json.dumps(new_json)) + +else: + print("Give me pads.json")