Cache parsed object in parser object
parent 724ac95886
commit adc7158862
@@ -19,9 +19,42 @@ class HtmlImport:
 class HtmlParser:
 
-    def __init__(self):
+    def __init__(self, text: str):
         self.l = logging.getLogger('solo_turnier.html_parser')
+        self.soup = BeautifulSoup(text, 'html.parser')
+
+    def getEventTitle(self):
+        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
+
+    def guessDataFromHtmlTitle(self, title = None):
+        if title is None:
+            title = self.getEventTitle()
+
+        match = re.compile('.*?ETW, Solos (.*)').match(title)
+        if match is None:
+            raise Exception(f'Cannot parse title "{title}"')
+
+        rest = match.group(1)
+        rawGroup, rawClass, dance = rest.split(' ', 2)
+
+        classMap = {
+            'Newcomer': 'Newc.',
+            'Beginner': 'Beg.',
+            'Advanced': 'Adv.'
+        }
+
+        groupMap = {
+            'Kinder': 'Kin.',
+            'Junioren': 'Jun.',
+            'Jugend': 'Jug.',
+        }
+
+        return {
+            'dance': dance.strip(),
+            'class_': classMap.get(rawClass, rawClass),
+            'group': groupMap.get(rawGroup, rawGroup)
+        }
 
     def parseString(self, text: str):
         soup = BeautifulSoup(text, 'html.parser')
 
@@ -68,10 +101,8 @@ class HtmlParser:
         ret = HtmlImport(title, participants)
         return ret
 
-    def parsePreparationRoundString(self, text: str):
-        soup = BeautifulSoup(text, 'html.parser')
-
-        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
+    def parsePreparationRound(self):
+        title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
         tableData = []
         rowTitles = []
 
@@ -120,7 +151,7 @@ class HtmlParser:
         def __mergeColumns(columns1, columns2):
             return list(map(lambda x, y: x + y, columns1, columns2))
 
-        extract = soup.find('div', class_='extract')
+        extract = self.soup.find('div', class_='extract')
         tables = extract.find_all('table', class_='tab1')
 
         __extractTitles(tables[0])
@@ -155,28 +186,3 @@ class HtmlParser:
             data['title'] = data['title'].strip()
             __cleanTable(data['data']['table'])
 
-    def guessDataFromHtmlTitle(self, title):
-        match = re.compile('.*?ETW, Solos (.*)').match(title)
-        if match is None:
-            raise Exception(f'Cannot parse title "{title}"')
-
-        rest = match.group(1)
-        rawGroup, rawClass, dance = rest.split(' ', 2)
-
-        classMap = {
-            'Newcomer': 'Newc.',
-            'Beginner': 'Beg.',
-            'Advanced': 'Adv.'
-        }
-
-        groupMap = {
-            'Kinder': 'Kin.',
-            'Junioren': 'Jun.',
-            'Jugend': 'Jug.',
-        }
-
-        return {
-            'dance': dance.strip(),
-            'class_': classMap.get(rawClass, rawClass),
-            'group': groupMap.get(rawGroup, rawGroup)
-        }
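
Note on the resulting API (a minimal sketch, not part of the diff): the constructor now parses the HTML once and caches the BeautifulSoup tree on the instance, so the accessors can reuse it instead of re-parsing. The import path is taken from the tests below; the placeholder HTML and variable names are illustrative only.

    from solo_turnier.html_parser import HtmlParser

    html = '...'                              # full result page, illustrative placeholder
    parser = HtmlParser(html)                 # HTML is parsed once; the soup is cached
    title = parser.getEventTitle()            # reads the cached soup
    meta = parser.guessDataFromHtmlTitle()    # defaults to the cached event title
    result = parser.parsePreparationRound()   # no longer takes the HTML string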
@@ -22,7 +22,7 @@ def test_extractDataFromHtml(dataProviderHtmlParser):
     htmlString = dataProviderHtmlParser[0]
     expected = dataProviderHtmlParser[1]
 
-    parser = solo_turnier.html_parser.HtmlParser()
+    parser = solo_turnier.html_parser.HtmlParser(htmlString)
     actualResult = parser.parseString(htmlString)
 
     participants = {}
@@ -66,7 +66,7 @@ def fixture_guessDataFromTitle(request):
     return (key, cases[key])
 
 def test_guessDataFromTitle(fixture_guessDataFromTitle):
-    parser = solo_turnier.html_parser.HtmlParser()
+    parser = solo_turnier.html_parser.HtmlParser('')
     ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0])
 
     assert ret == fixture_guessDataFromTitle[1]
@@ -89,8 +89,8 @@ def test_parsePreparationResult(fixture_parsePreparationResult):
     html = fixture_parsePreparationResult[0]
     jsonContent = fixture_parsePreparationResult[1]
 
-    parser = solo_turnier.html_parser.HtmlParser()
-    ret = parser.parsePreparationRoundString(html)
+    parser = solo_turnier.html_parser.HtmlParser(html)
+    ret = parser.parsePreparationRound()
 
     assert ret == jsonContent
 
@@ -112,7 +112,7 @@ def test_cleanPreparationImport(fixture_cleanPreparationImport):
     src = fixture_cleanPreparationImport[0]
     expected = fixture_cleanPreparationImport[1]
 
-    parser = solo_turnier.html_parser.HtmlParser()
+    parser = solo_turnier.html_parser.HtmlParser('')
     parser.cleanPreparationRoundImport(src)
 
     assert src == expected
 
@@ -221,7 +221,7 @@ class DataWorker:
 
     def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
         ret = {}
-        parser = html_parser.HtmlParser()
+        parser = html_parser.HtmlParser('')
         for imp in htmlImports:
             parsed = parser.guessDataFromHtmlTitle(imp.title)
             key = (parsed['group'], parsed['class_'], parsed['dance'])
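
The worker always passes an explicit title to guessDataFromHtmlTitle(), so an instance built from an empty string suffices: the cached soup is only consulted when no title argument is given. A small sketch of that path, with a made-up title string for illustration:

    parser = html_parser.HtmlParser('')   # empty document; the soup is never queried here
    parsed = parser.guessDataFromHtmlTitle('1. ETW, Solos Jugend Beginner Samba')
    # parsed == {'dance': 'Samba', 'class_': 'Beg.', 'group': 'Jug.'}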