from bs4 import BeautifulSoup

import logging
import re

# Participant cell text of the form "<name> (<number>)"; compiled once at
# import time instead of once per table row.
_PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

# Everything following "ETW, Solos " in a page title.
_TITLE_PATTERN = re.compile(r'.*?ETW, Solos (.*)')

# Long class / group names as they appear in titles -> abbreviated forms.
_CLASS_ABBREVIATIONS = {
    'Newcomer': 'Newc.',
    'Beginner': 'Beg.',
    'Advanced': 'Adv.',
}
_GROUP_ABBREVIATIONS = {
    'Kinder': 'Kin.',
    'Junioren': 'Jun.',
    'Jugend': 'Jug.',
}


class HtmlParser:
    """Extract participants and title information from result HTML pages."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str) -> dict:
        """Parse an HTML document into its participants and its title.

        All ``<table>`` elements inside ``<div class="extract">`` are read.
        The first table must be headed by the round name ``'Endrunde'``; its
        participants are flagged as finalists, those of every further table
        are not. In each table the first two rows are skipped (presumably
        header rows -- confirm against real input) and every remaining row
        must carry the place in its first ``<td>`` and a text of the form
        ``"<name> (<number>)"`` in its second ``<td>``.

        Args:
            text: The HTML markup to parse.

        Returns:
            A dict with the keys ``'participants'`` and ``'title'``.
            ``'participants'`` maps the participant number (as a string) to a
            dict with the keys ``'name'``, ``'place'`` and ``'finalist'``; a
            number that occurs more than once keeps the entry parsed last.
            ``'title'`` is the first child of the first cell of the table
            inside ``<div class="eventhead">``.

        Raises:
            Exception: If the first table is not headed by ``'Endrunde'`` or
                a participant cell does not match the expected pattern.
            AttributeError: If the ``extract`` or ``eventhead`` container (or
                the table structure below it) is missing from the document.
            IndexError: If a parsed row has fewer than two ``<td>`` cells.
        """
        soup = BeautifulSoup(text, 'html.parser')
        participants = {}

        def parse_rows(rows, finalist: bool) -> None:
            """Record every participant row of one table in ``participants``."""
            for row in rows:
                tds = row.find_all('td')
                place = tds[0].contents[0]
                match = _PARTICIPANT_PATTERN.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(
                        f'Could not match {tds} to regex search pattern'
                    )
                name, number = match.groups()
                participants[number] = {
                    'name': name,
                    'place': place,
                    'finalist': finalist,
                }

        tables = soup.find('div', class_='extract').find_all('table')
        if tables:
            first_table, *other_tables = tables

            # Only the first table is validated against the round name.
            round_name = first_table.tr.td.contents[0]
            if round_name != 'Endrunde':
                raise Exception('Could not parse HTML file')
            parse_rows(first_table.find_all('tr')[2:], True)

            for table in other_tables:
                parse_rows(table.find_all('tr')[2:], False)

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        return {
            'participants': participants,
            'title': title,
        }

    def guessDataFromHtmlTitle(self, title) -> dict:
        """Derive group, class and dance from a page title.

        The title must contain ``'ETW, Solos '`` followed by three
        space-separated parts: group, class and dance. Only the first two
        spaces are used for splitting, so the dance may itself contain
        spaces. Known group and class names are replaced by their
        abbreviations; unknown names are passed through unchanged.

        Args:
            title: The title text to analyse.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

        Raises:
            Exception: If the title does not contain ``'ETW, Solos '``.
            ValueError: If fewer than three parts follow ``'ETW, Solos '``.
        """
        match = _TITLE_PATTERN.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        parts = match.group(1).split(' ', 2)
        if len(parts) != 3:
            # Name the offending title instead of surfacing a bare
            # "not enough values to unpack" error.
            raise ValueError(f'Cannot parse title "{title}"')
        raw_group, raw_class, dance = parts

        return {
            'dance': dance,
            'class_': _CLASS_ABBREVIATIONS.get(raw_class, raw_class),
            'group': _GROUP_ABBREVIATIONS.get(raw_group, raw_group),
        }