from bs4 import BeautifulSoup
import bs4

import logging
import re


class HtmlParser:
    """Extract the ranking from a result page given as an HTML string.

    The parser looks at every ``<table>`` inside the ``<div class="extract">``
    element. The first table must be titled ``Endrunde``; participants listed
    in it are flagged as finalists, participants of all following tables are
    not.
    """

    # Matches a participant cell such as ``Max Mustermann 1 (14)``:
    # group 1 is the name, group 2 the number given in parentheses.
    # Compiled once here instead of once per table row.
    _PARTICIPANT_REGEX = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str) -> dict:
        """Parse *text* and return the participants keyed by their number.

        Args:
            text: Complete HTML document of the result page.

        Returns:
            A dict mapping the participant number (as a string) to a dict
            with the keys ``'name'``, ``'place'`` and ``'finalist'``. The
            dict is empty when the result container holds no tables.

        Raises:
            Exception: If the ``div`` with class ``extract`` is missing, if
                the first table is not titled ``Endrunde``, or if a
                participant cell does not look like ``<name> (<number>)``.
            IndexError: If a data row has fewer than two cells or one of
                the first two cells is empty.
        """
        soup = BeautifulSoup(text, 'html.parser')

        container = soup.find('div', class_='extract')
        if container is None:
            # find() returns None when nothing matches; fail with the
            # parser's own error instead of an AttributeError on None.
            raise Exception('Could not parse HTML file')

        tables = container.find_all('table')
        if tables:
            self._checkFirstTable(tables[0])

        ret = {}
        for index, table in enumerate(tables):
            # Only the first table lists the finalists.
            finalist = index == 0
            # The first two rows of every table are skipped; only the rows
            # after them are treated as participant rows.
            for row in table.find_all('tr')[2:]:
                number, participant = self._parseRow(row, finalist)
                ret[number] = participant

        return ret

    @staticmethod
    def _checkFirstTable(table) -> None:
        """Ensure the first cell of the first row of *table* reads ``Endrunde``."""
        firstRow = table.tr
        firstCell = firstRow.td if firstRow is not None else None
        if (
            firstCell is None
            or not firstCell.contents
            or firstCell.contents[0] != 'Endrunde'
        ):
            raise Exception('Could not parse HTML file')

    def _parseRow(self, row, finalist: bool):
        """Turn one table row into a ``(number, participant)`` pair."""
        tds = row.find_all('td')

        # str() detaches the text from the parse tree; a NavigableString
        # would keep the whole document alive inside the returned dict.
        place = str(tds[0].contents[0])

        match = self._PARTICIPANT_REGEX.fullmatch(str(tds[1].contents[0]))
        if match is None:
            raise Exception(f'Could not match {tds} to regex search pattern')

        participant = {
            'name': match.group(1),
            'place': place,
            'finalist': finalist,
        }
        return match.group(2), participant
+
+ + +
09.07.2022 - ETW, Solos Jun. Newc./Beg. Rumba + +
+
+
+
Ergebnis
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Endrunde
PlatzTeilnehmer
1.Max Mustermann 1 (14)
2.Max Mustermann 2 (9)
3.Max Mustermann 3 (13)
4.Max Mustermann 4 (17)
5.Max Mustermann 5 (6)
6.Max Mustermann 6 (27)
7.Max Mustermann 7 (22)
+ + + + + + + + + + + + + + + +
Vorrunde
8.Max Mustermann 8 (26)
9.Max Mustermann 9 (25)
+
+
+
+
+

Diese Liste wurde mit TopTurnier für Windows V9.3 erstellt.
+ + diff --git a/src/solo_turnier/tests/html_parser/1/expected.json b/src/solo_turnier/tests/html_parser/1/expected.json new file mode 100644 index 0000000..7438fa7 --- /dev/null +++ b/src/solo_turnier/tests/html_parser/1/expected.json @@ -0,0 +1,47 @@ +{ + "14": { + "name": "Max Mustermann 1", + "place": "1.", + "finalist": true + }, + "9": { + "name": "Max Mustermann 2", + "place": "2.", + "finalist": true + }, + "13": { + "name": "Max Mustermann 3", + "place": "3.", + "finalist": true + }, + "17": { + "name": "Max Mustermann 4", + "place": "4.", + "finalist": true + }, + "6": { + "name": "Max Mustermann 5", + "place": "5.", + "finalist": true + }, + "27": { + "name": "Max Mustermann 6", + "place": "6.", + "finalist": true + }, + "22": { + "name": "Max Mustermann 7", + "place": "7.", + "finalist": true + }, + "26": { + "name": "Max Mustermann 8", + "place": "8.", + "finalist": false + }, + "25": { + "name": "Max Mustermann 9", + "place": "9.", + "finalist": false + } +} diff --git a/src/solo_turnier/tests/html_parser/2/erg.htm b/src/solo_turnier/tests/html_parser/2/erg.htm new file mode 100644 index 0000000..97402ec --- /dev/null +++ b/src/solo_turnier/tests/html_parser/2/erg.htm @@ -0,0 +1,63 @@ + + + + + + + + + + 09.07.2022 Jun. Beginner Jive + + + +

+
+ + +
09.07.2022 - ETW, Solos Jun. Beginner Jive + +
+
+
+
Ergebnis
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Endrunde
PlatzTeilnehmer
1.Maxime Musterfrau 1 (14)
2.Maxime Musterfrau 2 (13)
3.Maxime Musterfrau 3 (17)
4.Maxime Musterfrau 4 (6)
5.Maxime Musterfrau 5 (22)
+ +
+
+
+
+
+

Diese Liste wurde mit TopTurnier für Windows V9.3 erstellt.
import pytest
import os
import json

import solo_turnier.html_parser


@pytest.fixture(scope='module', params=['1', '2'])
def dataProviderHtmlParser(request):
    """Provide ``(html, expected)`` for each fixture directory.

    Every parameter names a sub-directory of ``html_parser`` next to this
    test module. The directory holds the input page ``erg.htm`` and the
    expected parser output ``expected.json``.
    """
    variant = request.param
    # Named caseDir rather than dir so the builtin is not shadowed.
    caseDir = os.path.join(os.path.dirname(__file__), 'html_parser', variant)
    htmlFile = os.path.join(caseDir, 'erg.htm')
    jsonFile = os.path.join(caseDir, 'expected.json')

    # Explicit encoding: the pages contain non-ASCII characters, so the
    # result must not depend on the platform's default encoding.
    # NOTE(review): assumes the fixtures are stored as UTF-8 - confirm.
    with open(htmlFile, 'r', encoding='utf-8') as fp:
        html = fp.read()
    with open(jsonFile, 'r', encoding='utf-8') as fp:
        jsonContent = json.load(fp)

    return (html, jsonContent)


def test_extractDataFromHtml(dataProviderHtmlParser):
    """The parser output must equal the stored expectation of each case."""
    htmlString, expected = dataProviderHtmlParser

    parser = solo_turnier.html_parser.HtmlParser()
    actualResult = parser.parseString(htmlString)

    assert actualResult == expected