Cache parsed object in parser object
This commit is contained in:
parent
724ac95886
commit
adc7158862
@ -19,8 +19,41 @@ class HtmlImport:
|
|||||||
|
|
||||||
class HtmlParser:
|
class HtmlParser:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, text: str):
|
||||||
self.l = logging.getLogger('solo_turnier.html_parser')
|
self.l = logging.getLogger('solo_turnier.html_parser')
|
||||||
|
self.soup = BeautifulSoup(text, 'html.parser')
|
||||||
|
|
||||||
|
def getEventTitle(self):
    """Return the event title from the cached parsed document.

    The title is read from ``self.soup``: it is the first child node of the
    first ``td`` in the first ``tr`` of the first ``table`` inside the
    ``<div class="eventhead">`` element.

    Raises:
        AttributeError: if the document contains no ``eventhead`` div. The
            same exception type also surfaces (from the attribute chain
            below) when the div lacks the nested table/tr/td elements.
        IndexError: if the title cell has no child nodes.
    """
    head = self.soup.find('div', class_='eventhead')
    if head is None:
        # find() yields None when nothing matches. Keep the exception type
        # the bare attribute chain would have produced, but name the element
        # that is actually missing instead of reporting "'NoneType' object
        # has no attribute 'table'".
        raise AttributeError(
            'Cannot find <div class="eventhead"> in the parsed HTML document'
        )
    return head.table.tr.td.contents[0]
|
||||||
|
|
||||||
|
def guessDataFromHtmlTitle(self, title=None):
    """Derive group, class and dance from an event title.

    The title must contain the marker ``ETW, Solos `` followed by at least
    three whitespace-separated words: the group, the class, and the dance
    (the dance may itself consist of several words).

    Args:
        title: Title text to analyse. When ``None``, the title is fetched
            via ``self.getEventTitle()``.

    Returns:
        A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.
        Class and group names listed in the lookup tables below are replaced
        by their abbreviations; any other value is passed through unchanged.

    Raises:
        Exception: if the title does not contain the ``ETW, Solos `` marker.
        ValueError: if fewer than three words follow the marker.
    """
    if title is None:
        title = self.getEventTitle()

    match = re.match(r'.*?ETW, Solos (.*)', title)
    if match is None:
        raise Exception(f'Cannot parse title "{title}"')

    # Split on runs of whitespace rather than on single spaces, so that a
    # doubled space between words does not produce an empty field and shift
    # group/class/dance by one position.
    parts = match.group(1).split(None, 2)
    if len(parts) != 3:
        # Unpacking too few fields raised ValueError before; keep that type
        # but report which title failed instead of an unpacking message.
        raise ValueError(f'Cannot parse title "{title}"')
    rawGroup, rawClass, dance = parts

    classMap = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }

    groupMap = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    return {
        'dance': dance.strip(),
        'class_': classMap.get(rawClass, rawClass),
        'group': groupMap.get(rawGroup, rawGroup),
    }
|
||||||
|
|
||||||
def parseString(self, text: str):
|
def parseString(self, text: str):
|
||||||
soup = BeautifulSoup(text, 'html.parser')
|
soup = BeautifulSoup(text, 'html.parser')
|
||||||
@ -68,10 +101,8 @@ class HtmlParser:
|
|||||||
ret = HtmlImport(title, participants)
|
ret = HtmlImport(title, participants)
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def parsePreparationRoundString(self, text: str):
|
def parsePreparationRound(self):
|
||||||
soup = BeautifulSoup(text, 'html.parser')
|
title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]
|
||||||
|
|
||||||
title = soup.find('div', class_='eventhead').table.tr.td.contents[0]
|
|
||||||
tableData = []
|
tableData = []
|
||||||
rowTitles = []
|
rowTitles = []
|
||||||
|
|
||||||
@ -120,7 +151,7 @@ class HtmlParser:
|
|||||||
def __mergeColumns(columns1, columns2):
|
def __mergeColumns(columns1, columns2):
|
||||||
return list(map(lambda x, y: x + y, columns1, columns2))
|
return list(map(lambda x, y: x + y, columns1, columns2))
|
||||||
|
|
||||||
extract = soup.find('div', class_='extract')
|
extract = self.soup.find('div', class_='extract')
|
||||||
tables = extract.find_all('table', class_='tab1')
|
tables = extract.find_all('table', class_='tab1')
|
||||||
|
|
||||||
__extractTitles(tables[0])
|
__extractTitles(tables[0])
|
||||||
@ -155,28 +186,3 @@ class HtmlParser:
|
|||||||
data['title'] = data['title'].strip()
|
data['title'] = data['title'].strip()
|
||||||
__cleanTable(data['data']['table'])
|
__cleanTable(data['data']['table'])
|
||||||
|
|
||||||
def guessDataFromHtmlTitle(self, title):
|
|
||||||
match = re.compile('.*?ETW, Solos (.*)').match(title)
|
|
||||||
if match is None:
|
|
||||||
raise Exception(f'Cannot parse title "{title}"')
|
|
||||||
|
|
||||||
rest = match.group(1)
|
|
||||||
rawGroup, rawClass, dance = rest.split(' ', 2)
|
|
||||||
|
|
||||||
classMap = {
|
|
||||||
'Newcomer': 'Newc.',
|
|
||||||
'Beginner': 'Beg.',
|
|
||||||
'Advanced': 'Adv.'
|
|
||||||
}
|
|
||||||
|
|
||||||
groupMap = {
|
|
||||||
'Kinder': 'Kin.',
|
|
||||||
'Junioren': 'Jun.',
|
|
||||||
'Jugend': 'Jug.',
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
'dance': dance.strip(),
|
|
||||||
'class_': classMap.get(rawClass, rawClass),
|
|
||||||
'group': groupMap.get(rawGroup, rawGroup)
|
|
||||||
}
|
|
||||||
|
@ -22,7 +22,7 @@ def test_extractDataFromHtml(dataProviderHtmlParser):
|
|||||||
htmlString = dataProviderHtmlParser[0]
|
htmlString = dataProviderHtmlParser[0]
|
||||||
expected = dataProviderHtmlParser[1]
|
expected = dataProviderHtmlParser[1]
|
||||||
|
|
||||||
parser = solo_turnier.html_parser.HtmlParser()
|
parser = solo_turnier.html_parser.HtmlParser(htmlString)
|
||||||
actualResult = parser.parseString(htmlString)
|
actualResult = parser.parseString(htmlString)
|
||||||
|
|
||||||
participants = {}
|
participants = {}
|
||||||
@ -66,7 +66,7 @@ def fixture_guessDataFromTitle(request):
|
|||||||
return (key, cases[key])
|
return (key, cases[key])
|
||||||
|
|
||||||
def test_guessDataFromTitle(fixture_guessDataFromTitle):
|
def test_guessDataFromTitle(fixture_guessDataFromTitle):
|
||||||
parser = solo_turnier.html_parser.HtmlParser()
|
parser = solo_turnier.html_parser.HtmlParser('')
|
||||||
ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0])
|
ret = parser.guessDataFromHtmlTitle(fixture_guessDataFromTitle[0])
|
||||||
|
|
||||||
assert ret == fixture_guessDataFromTitle[1]
|
assert ret == fixture_guessDataFromTitle[1]
|
||||||
@ -89,8 +89,8 @@ def test_parsePreparationResult(fixture_parsePreparationResult):
|
|||||||
html = fixture_parsePreparationResult[0]
|
html = fixture_parsePreparationResult[0]
|
||||||
jsonContent = fixture_parsePreparationResult[1]
|
jsonContent = fixture_parsePreparationResult[1]
|
||||||
|
|
||||||
parser = solo_turnier.html_parser.HtmlParser()
|
parser = solo_turnier.html_parser.HtmlParser(html)
|
||||||
ret = parser.parsePreparationRoundString(html)
|
ret = parser.parsePreparationRound()
|
||||||
|
|
||||||
assert ret == jsonContent
|
assert ret == jsonContent
|
||||||
|
|
||||||
@ -112,7 +112,7 @@ def test_cleanPreparationImport(fixture_cleanPreparationImport):
|
|||||||
src = fixture_cleanPreparationImport[0]
|
src = fixture_cleanPreparationImport[0]
|
||||||
expected = fixture_cleanPreparationImport[1]
|
expected = fixture_cleanPreparationImport[1]
|
||||||
|
|
||||||
parser = solo_turnier.html_parser.HtmlParser()
|
parser = solo_turnier.html_parser.HtmlParser('')
|
||||||
parser.cleanPreparationRoundImport(src)
|
parser.cleanPreparationRoundImport(src)
|
||||||
|
|
||||||
assert src == expected
|
assert src == expected
|
||||||
|
@ -221,7 +221,7 @@ class DataWorker:
|
|||||||
|
|
||||||
def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
|
def _createHtmlLUT(self, htmlImports: list[html_parser.HtmlImport]):
|
||||||
ret = {}
|
ret = {}
|
||||||
parser = html_parser.HtmlParser()
|
parser = html_parser.HtmlParser('')
|
||||||
for imp in htmlImports:
|
for imp in htmlImports:
|
||||||
parsed = parser.guessDataFromHtmlTitle(imp.title)
|
parsed = parser.guessDataFromHtmlTitle(imp.title)
|
||||||
key = (parsed['group'], parsed['class_'], parsed['dance'])
|
key = (parsed['group'], parsed['class_'], parsed['dance'])
|
||||||
|
Loading…
Reference in New Issue
Block a user