Cache the parsed HTML document (BeautifulSoup object) in the HtmlParser instance

This commit is contained in:
Christian Wolf 2022-11-26 08:43:15 +01:00
parent 724ac95886
commit adc7158862
3 changed files with 43 additions and 37 deletions

View File

@ -19,8 +19,41 @@ class HtmlImport:
class HtmlParser:
def __init__(self):
def __init__(self, text: str):
    """Create a parser bound to one HTML document.

    The markup is parsed once here and kept on the instance as ``self.soup``
    so that the individual extraction methods can reuse it.

    Args:
        text: The raw HTML markup to parse.
    """
    loggerName = 'solo_turnier.html_parser'
    self.l = logging.getLogger(loggerName)
    # Parse eagerly with the 'html.parser' backend; methods read self.soup.
    self.soup = BeautifulSoup(text, features='html.parser')
def getEventTitle(self):
    """Return the first child of the first cell in the event header table.

    The lookup starts at the first ``<div class="eventhead">`` of the cached
    document and walks ``table -> tr -> td`` from there.
    """
    headerDiv = self.soup.find('div', class_='eventhead')
    firstCell = headerDiv.table.tr.td
    return firstCell.contents[0]
def guessDataFromHtmlTitle(self, title=None):
    """Split an event title into group, class and dance.

    The title must contain ``"ETW, Solos "`` followed by three
    space-separated parts: ``<group> <class> <dance>``. The dance may itself
    contain spaces. Known long group/class names are mapped to their
    abbreviations; unknown names are passed through unchanged.

    Args:
        title: The title to analyse. When ``None``, the title is read from
            the cached document via ``getEventTitle()``.

    Returns:
        A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

    Raises:
        Exception: If the title does not contain ``"ETW, Solos "``.
        ValueError: If fewer than three parts follow ``"ETW, Solos "``.
    """
    if title is None:
        title = self.getEventTitle()

    match = re.match(r'.*?ETW, Solos (.*)', title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Split off group and class only; everything left over is the dance.
    parts = match.group(1).split(' ', 2)
    if len(parts) != 3:
        # Plain tuple unpacking would also raise ValueError here, but with a
        # message that does not say which title was at fault.
        raise ValueError(
            f'Cannot parse title "{title}": '
            'expected "<group> <class> <dance>" after "ETW, Solos"'
        )
    rawGroup, rawClass, dance = parts

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }
    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup),
    }
def parseString(self, text: str):
soup = BeautifulSoup(text, 'html.parser')
@ -68,10 +101,8 @@ class HtmlParser:
ret = HtmlImport(title, participants)
return ret
def parsePreparationRoundString(self, text: str):
soup = BeautifulSoup(text, 'html.parser')
title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
def parsePreparationRound(self):
title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
tableData = []
rowTitles = []
@ -120,7 +151,7 @@ class HtmlParser:
def __mergeColumns(columns1, columns2):
return list(map(lambda x, y: x + y, columns1, columns2))
extract = soup.find('div', class_='extract')
extract = self.soup.find('div', class_='extract')
tables = extract.find_all('table', class_='tab1')
__extractTitles(tables[0])
@ -155,28 +186,3 @@ class HtmlParser:
data['title'] = data['title'].strip()
__cleanTable(data['data']['table'])
def guessDataFromHtmlTitle(self, title):
    """Split an event title into group, class and dance.

    The title must contain ``"ETW, Solos "`` followed by three
    space-separated parts: ``<group> <class> <dance>``. The dance may itself
    contain spaces. Known long group/class names are mapped to their
    abbreviations; unknown names are passed through unchanged.

    Args:
        title: The title to analyse.

    Returns:
        A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

    Raises:
        Exception: If the title does not contain ``"ETW, Solos "``.
        ValueError: If fewer than three parts follow ``"ETW, Solos "``.
    """
    match = re.match(r'.*?ETW, Solos (.*)', title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Split off group and class only; everything left over is the dance.
    parts = match.group(1).split(' ', 2)
    if len(parts) != 3:
        # Plain tuple unpacking would also raise ValueError here, but with a
        # message that does not say which title was at fault.
        raise ValueError(
            f'Cannot parse title "{title}": '
            'expected "<group> <class> <dance>" after "ETW, Solos"'
        )
    rawGroup, rawClass, dance = parts

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }
    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup),
    }

View File

@ -22,7 +22,7 @@ def test_extractDataFromHtml(dataProviderHtmlParser):
htmlString = dataProviderHtmlParser[0]
expected = dataProviderHtmlParser[1]
parser = solo_turnier.html_parser.HtmlParser()
parser = solo_turnier.html_parser.HtmlParser(htmlString)
actualResult = parser.parseString(htmlString)
participants = {}
@ -66,7 +66,7 @@ def fixture_guessDataFromTitle(request):
return (key, cases[key])
def test_guessDataFromTitle(fixture_guessDataFromTitle):
parser = solo_turnier.html_parser.HtmlParser()
parser = solo_turnier.html_parser.HtmlParser('')
ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0])
assert ret == fixture_guessDataFromTitle[1]
@ -89,8 +89,8 @@ def test_parsePreparationResult(fixture_parsePreparationResult):
html = fixture_parsePreparationResult[0]
jsonContent = fixture_parsePreparationResult[1]
parser = solo_turnier.html_parser.HtmlParser()
ret = parser.parsePreparationRoundString(html)
parser = solo_turnier.html_parser.HtmlParser(html)
ret = parser.parsePreparationRound()
assert ret == jsonContent
@ -112,7 +112,7 @@ def test_cleanPreparationImport(fixture_cleanPreparationImport):
src = fixture_cleanPreparationImport[0]
expected = fixture_cleanPreparationImport[1]
parser = solo_turnier.html_parser.HtmlParser()
parser = solo_turnier.html_parser.HtmlParser('')
parser.cleanPreparationRoundImport(src)
assert src == expected

View File

@ -221,7 +221,7 @@ class DataWorker:
def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
ret = {}
parser = html_parser.HtmlParser()
parser = html_parser.HtmlParser('')
for imp in htmlImports:
parsed = parser.guessDataFromHtmlTitle(imp.title)
key = (parsed['group'], parsed['class_'], parsed['dance'])