solo-auswertung/src/solo_turnier/html_parser.py

89 lines
2.5 KiB
Python
Raw Normal View History

from bs4 import BeautifulSoup
import logging
import re
class HtmlParser:
    """Extract participants and title information from a result HTML page."""

    # Matches a participant cell of the form "<name> (<number>)".
    # Compiled once here instead of once per table row.
    _PARTICIPANT_REGEX = re.compile(r'(.*) \(([0-9]+)\)')

    # Captures everything following "ETW, Solos " in a page title.
    _TITLE_REGEX = re.compile('.*?ETW, Solos (.*)')

    # Long class / group names and the abbreviations they are mapped to.
    _CLASS_MAP = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }
    _GROUP_MAP = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse the HTML document *text* into participants and a title.

        All ``<table>`` elements inside ``<div class="extract">`` are read.
        The first table must carry the round name ``'Endrunde'`` in its
        first cell; its participants are flagged as finalists. Participants
        of every further table are flagged as non-finalists.

        Args:
            text: The HTML source to parse.

        Returns:
            A dict with two keys: ``'participants'`` maps the participant
            number (the digits captured from the cell, as a string) to a
            dict with the keys ``'name'``, ``'place'`` and ``'finalist'``;
            ``'title'`` holds the first content node of the first cell of
            the table inside ``<div class="eventhead">``.

        Raises:
            Exception: If the first table is not the ``'Endrunde'`` table,
                or a participant cell does not look like ``name (number)``.
            AttributeError: If one of the expected ``div`` elements is
                missing (``soup.find`` then yields ``None``).
        """
        soup = BeautifulSoup(text, 'html.parser')
        participants = {}

        def parseRows(rows, finalist: bool):
            # Later rows with an already-seen number overwrite the entry.
            for row in rows:
                tds = row.find_all('td')
                place = tds[0].contents[0]
                match = self._PARTICIPANT_REGEX.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')
                participants[match.group(2)] = {
                    'name': match.group(1),
                    'place': place,
                    'finalist': finalist,
                }

        tables = soup.find('div', class_='extract').find_all('table')

        # In every table the first two rows are skipped; the first one holds
        # the round name, the second is presumably the column header row
        # (TODO confirm against a real export).
        if tables:
            firstTable = tables[0]
            if firstTable.tr.td.contents[0] != 'Endrunde':
                raise Exception('Could not parse HTML file')
            parseRows(firstTable.find_all('tr')[2:], True)

        for table in tables[1:]:
            parseRows(table.find_all('tr')[2:], False)

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        return {
            'participants': participants,
            'title': title,
        }

    def guessDataFromHtmlTitle(self, title):
        """Derive group, class and dance from a page title.

        The text following ``"ETW, Solos "`` is split on single spaces into
        group, class and dance; the dance keeps any further spaces. Known
        long group/class names are replaced by their abbreviations, unknown
        ones are passed through unchanged.

        Args:
            title: The page title, e.g. as returned by ``parseString``.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

        Raises:
            Exception: If the title does not contain ``"ETW, Solos "``.
            ValueError: If fewer than three space-separated parts follow
                ``"ETW, Solos "``.
        """
        match = self._TITLE_REGEX.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        parts = match.group(1).split(' ', 2)
        if len(parts) != 3:
            # Bare tuple unpacking would also raise ValueError here, but
            # with a message that does not mention the offending title.
            raise ValueError(
                f'Cannot parse title "{title}": '
                'expected group, class and dance after "ETW, Solos"'
            )
        rawGroup, rawClass, dance = parts

        return {
            'dance': dance,
            'class_': self._CLASS_MAP.get(rawClass, rawClass),
            'group': self._GROUP_MAP.get(rawGroup, rawGroup),
        }