Cache parsed object in parser object

This commit is contained in:
Christian Wolf 2022-11-26 08:43:15 +01:00
parent 724ac95886
commit adc7158862
3 changed files with 43 additions and 37 deletions

View File

@ -19,8 +19,41 @@ class HtmlImport:
class HtmlParser: class HtmlParser:
def __init__(self): def __init__(self, text: str):
self.l = logging.getLogger('solo_turnier.html_parser') self.l = logging.getLogger('solo_turnier.html_parser')
self.soup = BeautifulSoup(text, 'html.parser')
def getEventTitle(self):
    """Return the event title taken from the parsed document.

    The title is the first child of the first table cell inside the
    ``div`` with class ``eventhead``.
    """
    eventHead = self.soup.find('div', class_='eventhead')
    firstCell = eventHead.table.tr.td
    return firstCell.contents[0]
def guessDataFromHtmlTitle(self, title=None):
    """Derive group, class and dance from an event title.

    The title must contain ``ETW, Solos`` followed by three pieces of
    information: the group, the class and the dance. The dance may itself
    consist of several words.

    Args:
        title: The title to analyse. When ``None``, the title is obtained
            from ``self.getEventTitle()``.

    Returns:
        A dict with the keys ``dance``, ``class_`` and ``group``. Known
        class and group names are replaced by their abbreviations; unknown
        names are passed through unchanged.

    Raises:
        Exception: If the title does not contain ``ETW, Solos``.
        ValueError: If fewer than three tokens follow ``ETW, Solos``.
    """
    if title is None:
        title = self.getEventTitle()

    match = re.match('.*?ETW, Solos (.*)', title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Split on runs of whitespace so that doubled blanks between the tokens
    # do not produce empty group/class entries.
    parts = match.group(1).split(None, 2)
    if len(parts) != 3:
        # Report the offending title instead of a bare unpacking error.
        raise ValueError(
            f'Cannot parse title "{title}": '
            'expected group, class and dance after "ETW, Solos"'
        )
    rawGroup, rawClass, dance = parts

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }

    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup),
    }
def parseString(self, text: str): def parseString(self, text: str):
soup = BeautifulSoup(text, 'html.parser') soup = BeautifulSoup(text, 'html.parser')
@ -68,10 +101,8 @@ class HtmlParser:
ret = HtmlImport(title, participants) ret = HtmlImport(title, participants)
return ret return ret
def parsePreparationRoundString(self, text: str): def parsePreparationRound(self):
soup = BeautifulSoup(text, 'html.parser') title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
tableData = [] tableData = []
rowTitles = [] rowTitles = []
@ -120,7 +151,7 @@ class HtmlParser:
def __mergeColumns(columns1, columns2): def __mergeColumns(columns1, columns2):
return list(map(lambda x, y: x + y, columns1, columns2)) return list(map(lambda x, y: x + y, columns1, columns2))
extract = soup.find('div', class_='extract') extract = self.soup.find('div', class_='extract')
tables = extract.find_all('table', class_='tab1') tables = extract.find_all('table', class_='tab1')
__extractTitles(tables[0]) __extractTitles(tables[0])
@ -155,28 +186,3 @@ class HtmlParser:
data['title'] = data['title'].strip() data['title'] = data['title'].strip()
__cleanTable(data['data']['table']) __cleanTable(data['data']['table'])
def guessDataFromHtmlTitle(self, title):
    """Derive group, class and dance from an event title.

    The title must contain ``ETW, Solos`` followed by the group, the class
    and the dance, in that order. The dance may span several words.

    Args:
        title: The title string to analyse.

    Returns:
        A dict with the keys ``dance``, ``class_`` and ``group``. Known
        class and group names are abbreviated; unknown names are returned
        as found.

    Raises:
        Exception: If the title does not contain ``ETW, Solos``.
        ValueError: If fewer than three tokens follow ``ETW, Solos``.
    """
    match = re.match('.*?ETW, Solos (.*)', title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Whitespace-run splitting keeps doubled blanks from yielding empty
    # tokens; at most three pieces so a multi-word dance stays intact.
    tokens = match.group(1).split(None, 2)
    if len(tokens) != 3:
        # Name the title in the error rather than failing on unpacking.
        raise ValueError(
            f'Cannot parse title "{title}": '
            'expected group, class and dance after "ETW, Solos"'
        )
    rawGroup, rawClass, dance = tokens

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }

    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup),
    }

View File

@ -22,7 +22,7 @@ def test_extractDataFromHtml(dataProviderHtmlParser):
htmlString = dataProviderHtmlParser[0] htmlString = dataProviderHtmlParser[0]
expected = dataProviderHtmlParser[1] expected = dataProviderHtmlParser[1]
parser = solo_turnier.html_parser.HtmlParser() parser = solo_turnier.html_parser.HtmlParser(htmlString)
actualResult = parser.parseString(htmlString) actualResult = parser.parseString(htmlString)
participants = {} participants = {}
@ -66,7 +66,7 @@ def fixture_guessDataFromTitle(request):
return (key, cases[key]) return (key, cases[key])
def test_guessDataFromTitle(fixture_guessDataFromTitle): def test_guessDataFromTitle(fixture_guessDataFromTitle):
parser = solo_turnier.html_parser.HtmlParser() parser = solo_turnier.html_parser.HtmlParser('')
ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0]) ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0])
assert ret == fixture_guessDataFromTitle[1] assert ret == fixture_guessDataFromTitle[1]
@ -89,8 +89,8 @@ def test_parsePreparationResult(fixture_parsePreparationResult):
html = fixture_parsePreparationResult[0] html = fixture_parsePreparationResult[0]
jsonContent = fixture_parsePreparationResult[1] jsonContent = fixture_parsePreparationResult[1]
parser = solo_turnier.html_parser.HtmlParser() parser = solo_turnier.html_parser.HtmlParser(html)
ret = parser.parsePreparationRoundString(html) ret = parser.parsePreparationRound()
assert ret == jsonContent assert ret == jsonContent
@ -112,7 +112,7 @@ def test_cleanPreparationImport(fixture_cleanPreparationImport):
src = fixture_cleanPreparationImport[0] src = fixture_cleanPreparationImport[0]
expected = fixture_cleanPreparationImport[1] expected = fixture_cleanPreparationImport[1]
parser = solo_turnier.html_parser.HtmlParser() parser = solo_turnier.html_parser.HtmlParser('')
parser.cleanPreparationRoundImport(src) parser.cleanPreparationRoundImport(src)
assert src == expected assert src == expected

View File

@ -221,7 +221,7 @@ class DataWorker:
def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]): def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
ret = {} ret = {}
parser = html_parser.HtmlParser() parser = html_parser.HtmlParser('')
for imp in htmlImports: for imp in htmlImports:
parsed = parser.guessDataFromHtmlTitle(imp.title) parsed = parser.guessDataFromHtmlTitle(imp.title)
key = (parsed['group'], parsed['class_'], parsed['dance']) key = (parsed['group'], parsed['class_'], parsed['dance'])