Compare commits


2 commits

8 changed files with 60 additions and 252 deletions

.gitignore

@@ -1,10 +1,3 @@
# Python
__pycache__
venv/
images/
pads/
pads.json
new_pads.json
# ---> Node
# Logs
logs


@@ -2,49 +2,12 @@
Simple script to migrate content from one Hedgedoc instance to another.
## Cypress
## Setup
Follow the guide over [here](https://docs.cypress.io/guides/getting-started/installing-cypress#Linux-Prerequisites).
You also need Node.js. Run `npm ci` to install the required packages.
### Execution
## Execution
To start Cypress, simply execute `npx cypress open`, then click `E2E Testing` and run the spec using Electron. This step could be automated using the Cypress module [API](https://docs.cypress.io/guides/guides/module-api).
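For example, a minimal runner script along these lines should work with the module API (the spec path is only a placeholder and has to point at the actual spec file in this repo):
```
// run-migration.js - rough sketch, not part of this repo
const cypress = require('cypress')

cypress.run({
  browser: 'electron',
  // placeholder path, adjust to the real spec file
  spec: 'cypress/e2e/migrate.cy.js',
}).then((results) => {
  // results contains run totals when Cypress was able to run the spec
  console.log(`${results.totalPassed} passed, ${results.totalFailed} failed`)
})
```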
## Python
Create a new Python venv:
```
python -m venv venv
```
Activate the new environment (Linux):
```
source venv/bin/activate
```
Install the requirements
```
pip install -r requirements.txt
```
### Execution
Start the extractor
```
python hedgedoc-image.py --replace new_netloc meta_pad1 meta_pad2 ...
```
For example:
```
python hedgedoc-image.py --replace pad.hacknang.de https://md.margau.net/dbk-meta
```
### Produced files
The Python script produces a `pads.json` file which contains the mapping from `oldUrl` to `newUrl`.
All images land in `images/uploads`. Only images hosted on the old pads' domain are saved.
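For reference, the generated mapping has the same shape as the Cypress `pads` fixture; a `pads.json` with a single migrated pad would look roughly like this (URLs are placeholders):
```
[
  {
    "oldUrl": "https://md.margau.net/example-pad",
    "newUrl": "https://pad.hacknang.de/example-pad"
  }
]
```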


@@ -1,31 +1,56 @@
String.prototype.replaceLast = function (what, replacement) {
  var pcs = this.split(what);
  var lastPc = pcs.pop();
  return pcs.join(what) + replacement + lastPc;
}

async function getContent(url) {
  const res = await fetch(url.concat('/download'));
  return res.text();
}

async function migrateDocument(url, baseUrl) {
  const content = await getContent(url);
  cy.request({
    url: baseUrl.concat('/new'),
    method: 'POST',
    headers: {
      'Content-Type': 'text/markdown',
      'Access-Control-Allow-Origin': new URL(baseUrl).hostname,
    },
    body: content,
  }).then((res) => {
    const redirect = res.redirects[0].split(' ')[1];
    cy.visit(url);
    cy.get('#view-mode-toggle-edit').click({force: true});
    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
    cy.get('.CodeMirror-scroll').type(`Moved to [${redirect}](${redirect})`);
  });

async function migrateDocument(oldUrl, newUrl) {
  // get content of the old pad
  var content = await getContent(oldUrl);
  // replace URLs
  content = content.replaceAll(new URL(oldUrl).hostname, new URL(newUrl).hostname);
  // visit the new pad URL (not possible via POST API request because the pad may already exist)
  // Caution: content of the new pad URL will be overwritten!
  // Caution: history will not be moved to the new pad URL
  cy.visit(newUrl.concat('?edit'));
  cy.window().then((win) => {
    // write content
    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
    cy.get('.CodeMirror-scroll').type(content);
    // visit the old pad and replace its content
    cy.visit(oldUrl);
    cy.get('#view-mode-toggle-edit').click({force: true});
    cy.get('.CodeMirror-scroll').type('{ctrl}a{backspace}');
    cy.get('.CodeMirror-scroll').type(`# 301 Pad moved{enter}=> [${newUrl}](${newUrl})`);
  })
}

describe('Migrate document', () => {
  it('passes', async () => {
    const baseUrl = 'https://md.margau.net';
    const url = 'https://md.margau.net/H0JO3L5DS-6Yhv4RrdS-tw';
    migrateDocument(url, baseUrl);
  });
});

  it('passes', async () => {
    // read the list of pads to migrate from the fixture
    cy.fixture('pads').then(async (pads) => {
      if (pads.length === 0) {
        console.log("Didn't find any pad urls to migrate");
      }
      for (const pad of pads) {
        await migrateDocument(pad.oldUrl, pad.newUrl);
      }
    })
  })
})


@@ -0,0 +1,10 @@
[
  {
    "oldUrl": "https://md.margau.net/test",
    "newUrl": "https://pad.hacknang.de/test"
  },
  {
    "oldUrl": "https://md.margau.net/test3",
    "newUrl": "https://pad.hacknang.de/test2"
  }
]


@@ -1,147 +0,0 @@
import requests
import json
import re
import argparse
import os
from urllib.parse import urlparse
from typing import List
from os.path import exists

IMG_PATH = "images/"
PAD_PATH = "pads/"
# netloc of the target instance, set via --replace
NEW_NETLOC = ""

# pads that have already been downloaded, to avoid loops
seen_pads = set()
pads_json = []

os.makedirs(PAD_PATH, exist_ok=True)
os.makedirs(IMG_PATH, exist_ok=True)
os.makedirs(IMG_PATH + "uploads", exist_ok=True)


class Image:
    def __init__(self, _old_url: str) -> None:
        self.old_url = _old_url
        old = urlparse(self.old_url)
        self.new_url = "." + old.path

    def download(self):
        r = requests.get(self.old_url)
        p = IMG_PATH + urlparse(self.old_url).path
        with open(p, 'wb') as f:
            f.write(r.content)
        print(f"Downloaded image {self.old_url} to {p}")


class PadPage:
    def __init__(self, _old_url: str):
        self.old_url: str = _old_url
        self.images: List = []
        self.linked_pads = []
        self.content: str = ""

    def gather_linked_pads(self):
        regex = r'https://[\w\d.]+/(?!upload)[\w\-_]+'
        matches = re.findall(regex, self.content)
        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                self.linked_pads.append(PadPage(match))
            else:
                print("Dropped pad, wrong netloc")

    def to_dict(self) -> dict:
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return {"oldUrl": self.old_url, "newUrl": new.geturl()}

    def to_json(self) -> str:
        old = urlparse(self.old_url)
        new = old._replace(netloc=NEW_NETLOC)
        return json.dumps({"oldUrl": self.old_url, "newUrl": new.geturl()})

    def download_and_process_content(self):
        print(f"Downloading pad at {self.old_url}")
        seen_pads.add(self.old_url)
        r = requests.get(self.old_url + "/download")
        if r.status_code == 200:
            self.content = r.text
            with open(PAD_PATH + urlparse(self.old_url).path + ".md", "w") as f:
                f.write(self.content)
            num = self._find_images()
            print(f"Found {num} images")
        else:
            print(f"Error downloading pad {self.old_url}, got HTTP status code {r.status_code}")

    # returns the number of images found
    def _find_images(self) -> int:
        regex = r'https://[\w\d.]+/uploads/[\w\d]+\.(?:png|jpg|jpeg|webp)'
        matches = re.findall(regex, self.content)
        full_url = urlparse(self.old_url)
        for match in matches:
            print(f"match: {match}")
            url = urlparse(match)
            if url.netloc == full_url.netloc:
                i = Image(match)
                self.images.append(i)
            else:
                print("Dropped image, wrong netloc")
        return len(matches)

    def download_images(self):
        for i in self.images:
            i.download()

    def recursive(self):
        # depth-first walk over all pads linked from this one
        for pad in self.linked_pads:
            if pad.old_url not in seen_pads:
                print(f"New pad found: {pad.old_url}")
                pad.download_and_process_content()
                pad.download_images()
                pad.gather_linked_pads()
                pads_json.append(pad.to_dict())
                pad.recursive()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Recursively downloads all images from a Hedgedoc pad")
    parser.add_argument("pad_url", metavar="N", type=str, nargs="+",
                        help="URLs of the pads to start searching from")
    parser.add_argument("--replace", dest="new_pad_url", metavar="B", type=str, nargs=1,
                        help="netloc of the new pad instance used when generating the JSON")
    args = parser.parse_args()

    # resume from an existing pads.json if present
    old_file = []
    if exists("pads.json"):
        with open("pads.json", "r") as f:
            old_file = json.load(f)
        for entry in old_file:
            seen_pads.add(entry["oldUrl"])
        pads_json.extend(old_file)
        print(f"Seen pads: {seen_pads}")

    NEW_NETLOC = args.new_pad_url[0]
    for pad_url in args.pad_url:
        pad = PadPage(pad_url)
        pad.download_and_process_content()
        pad.download_images()
        pad.gather_linked_pads()
        pads_json.append(pad.to_dict())
        pad.recursive()

    print(f"We have seen {len(seen_pads)} pads, which were {seen_pads}")
    with open("pads.json", "w") as f:
        f.write(json.dumps(pads_json))


@@ -1,27 +0,0 @@
import requests
import json
import os
from os.path import exists

pads_json = []
new_json = []

if exists("pads.json"):
    with open("pads.json", "r") as f:
        old_file = json.load(f)
        pads_json.extend(old_file)
    # keep only pads whose old URL still serves more than a trivial amount of content
    for pad in pads_json:
        print("Checking: " + pad['oldUrl'] + "/download")
        r = requests.head(pad['oldUrl'] + "/download")
        if int(r.headers['content-length']) > 150:
            new_json.append(pad)
    with open("new_pads.json", "w") as f:
        f.write(json.dumps(new_json))
else:
    print("Give me pads.json")


@@ -1 +0,0 @@
# Java


@@ -1,8 +0,0 @@
beautifulsoup4==4.11.1
bs4==0.0.1
certifi==2022.6.15.1
charset-normalizer==2.1.1
idna==3.3
requests==2.28.1
soupsieve==2.3.2.post1
urllib3==1.26.12