From adc7158862911d1e757be9bc16d8371d7c6aefef Mon Sep 17 00:00:00 2001 From: Christian Wolf Date: Sat, 26 Nov 2022 08:43:15 +0100 Subject: [PATCH] Cache parsed object in parser object --- src/solo_turnier/html_parser.py | 68 ++++++++++++---------- src/solo_turnier/tests/test_html_parser.py | 10 ++-- src/solo_turnier/worker.py | 2 +- 3 files changed, 43 insertions(+), 37 deletions(-) diff --git a/src/solo_turnier/html_parser.py b/src/solo_turnier/html_parser.py index 5bd35a4..9224d31 100644 --- a/src/solo_turnier/html_parser.py +++ b/src/solo_turnier/html_parser.py @@ -19,9 +19,42 @@ class HtmlImport: class HtmlParser: - def __init__(self): + def __init__(self, text: str): self.l = logging.getLogger('solo_turnier.html_parser') + self.soup = BeautifulSoup(text, 'html.parser') + def getEventTitle(self): + return self.soup.find('div', class_='eventhead').table.tr.td.contents[0] + + def guessDataFromHtmlTitle(self, title = None): + if title is None: + title = self.getEventTitle() + + match = re.compile('.*?ETW, Solos (.*)').match(title) + if match is None: + raise Exception(f'Cannot parse title "{title}"') + + rest = match.group(1) + rawGroup, rawClass, dance = rest.split(' ', 2) + + classMap = { + 'Newcomer': 'Newc.', + 'Beginner': 'Beg.', + 'Advanced': 'Adv.' + } + + groupMap = { + 'Kinder': 'Kin.', + 'Junioren': 'Jun.', + 'Jugend': 'Jug.', + } + + return { + 'dance': dance.strip(), + 'class_': classMap.get(rawClass, rawClass), + 'group': groupMap.get(rawGroup, rawGroup) + } + def parseString(self, text: str): soup = BeautifulSoup(text, 'html.parser') @@ -68,10 +101,8 @@ class HtmlParser: ret = HtmlImport(title, participants) return ret - def parsePreparationRoundString(self, text: str): - soup = BeautifulSoup(text, 'html.parser') - - title = soup.find('div', class_='eventhead').table.tr.td.contents[0] + def parsePreparationRound(self): + title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0] tableData = [] rowTitles = [] @@ -120,7 +151,7 @@ class HtmlParser: def __mergeColumns(columns1, columns2): return list(map(lambda x, y: x + y, columns1, columns2)) - extract = soup.find('div', class_='extract') + extract = self.soup.find('div', class_='extract') tables = extract.find_all('table', class_='tab1') __extractTitles(tables[0]) @@ -155,28 +186,3 @@ class HtmlParser: data['title'] = data['title'].strip() __cleanTable(data['data']['table']) - def guessDataFromHtmlTitle(self, title): - match = re.compile('.*?ETW, Solos (.*)').match(title) - if match is None: - raise Exception(f'Cannot parse title "{title}"') - - rest = match.group(1) - rawGroup, rawClass, dance = rest.split(' ', 2) - - classMap = { - 'Newcomer': 'Newc.', - 'Beginner': 'Beg.', - 'Advanced': 'Adv.' - } - - groupMap = { - 'Kinder': 'Kin.', - 'Junioren': 'Jun.', - 'Jugend': 'Jug.', - } - - return { - 'dance': dance.strip(), - 'class_': classMap.get(rawClass, rawClass), - 'group': groupMap.get(rawGroup, rawGroup) - } diff --git a/src/solo_turnier/tests/test_html_parser.py b/src/solo_turnier/tests/test_html_parser.py index 0bc1bc2..2f79882 100644 --- a/src/solo_turnier/tests/test_html_parser.py +++ b/src/solo_turnier/tests/test_html_parser.py @@ -22,7 +22,7 @@ def test_extractDataFromHtml(dataProviderHtmlParser): htmlString = dataProviderHtmlParser[0] expected = dataProviderHtmlParser[1] - parser = solo_turnier.html_parser.HtmlParser() + parser = solo_turnier.html_parser.HtmlParser(htmlString) actualResult = parser.parseString(htmlString) participants = {} @@ -66,7 +66,7 @@ def fixture_guessDataFromTitle(request): return (key, cases[key]) def test_guessDataFromTitle(fixture_guessDataFromTitle): - parser = solo_turnier.html_parser.HtmlParser() + parser = solo_turnier.html_parser.HtmlParser('') ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0]) assert ret == fixture_guessDataFromTitle[1] @@ -89,8 +89,8 @@ def test_parsePreparationResult(fixture_parsePreparationResult): html = fixture_parsePreparationResult[0] jsonContent = fixture_parsePreparationResult[1] - parser = solo_turnier.html_parser.HtmlParser() - ret = parser.parsePreparationRoundString(html) + parser = solo_turnier.html_parser.HtmlParser(html) + ret = parser.parsePreparationRound() assert ret == jsonContent @@ -112,7 +112,7 @@ def test_cleanPreparationImport(fixture_cleanPreparationImport): src = fixture_cleanPreparationImport[0] expected = fixture_cleanPreparationImport[1] - parser = solo_turnier.html_parser.HtmlParser() + parser = solo_turnier.html_parser.HtmlParser('') parser.cleanPreparationRoundImport(src) assert src == expected diff --git a/src/solo_turnier/worker.py b/src/solo_turnier/worker.py index 5cfc0d9..ab7e9e0 100644 --- a/src/solo_turnier/worker.py +++ b/src/solo_turnier/worker.py @@ -221,7 +221,7 @@ class DataWorker: def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]): ret = {} - parser = html_parser.HtmlParser() + parser = html_parser.HtmlParser('') for imp in htmlImports: parsed = parser.guessDataFromHtmlTitle(imp.title) key = (parsed['group'], parsed['class_'], parsed['dance'])