from bs4 import BeautifulSoup
import logging
import re


class HtmlParticipant:
    """A single participant as extracted from an HTML result page."""

    def __init__(self, name, place, finalist):
        self.name = name
        self.place = place
        self.finalist = finalist

    def __str__(self):
        return f'{self.name} (with place {self.place})'


class HtmlImport:
    """Container for the event title and the participants keyed by start number."""

    def __init__(self, title: str, participants: dict[str, HtmlParticipant]):
        self.title = title
        self.participants = participants


class HtmlParser:
    def __init__(self, text: str):
        self.l = logging.getLogger('solo_turnier.html_parser')
        self.soup = BeautifulSoup(text, 'html.parser')

    def getEventTitle(self):
        """Return the event title from the page header."""
        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title=None):
        """Derive dance, class and age group from a title like '... ETW, Solos <group> <class> <dance>'."""
        if title is None:
            title = self.getEventTitle()

        match = re.compile('.*?ETW, Solos (.*)').match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')
        rest = match.group(1)
        rawGroup, rawClass, dance = rest.split(' ', 2)

        classMap = {
            'Newcomer': 'Newc.',
            'Beginner': 'Beg.',
            'Advanced': 'Adv.',
        }
        groupMap = {
            'Kinder': 'Kin.',
            'Junioren': 'Jun.',
            'Jugend': 'Jug.',
        }

        return {
            'dance': dance.strip(),
            'class_': classMap.get(rawClass, rawClass),
            'group': groupMap.get(rawGroup, rawGroup),
        }

    def parseString(self, text: str):
        """Parse a result page and return an HtmlImport with all participants.

        Rows of the first table ('Endrunde') are marked as finalists, rows of
        all remaining tables as non-finalists.
        """
        soup = BeautifulSoup(text, 'html.parser')

        participants = {}

        def __parseRows(rows, finalist: bool):
            def __parseRow(row):
                tds = row.find_all('td')
                # Entries look like 'Name (123)': capture the name and the start number.
                regex = re.compile(r'(.*) \(([0-9]+)\)')

                place = tds[0].contents[0]
                match = regex.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')

                name = match.group(1)
                number = match.group(2)

                participant = HtmlParticipant(name, place, finalist)
                participants[number] = participant

            for row in rows:
                __parseRow(row)

        def __parseFirstTable(table):
            roundName = table.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')

            # Skip the two header rows; the remaining rows are the finalists.
            __parseRows(table.find_all('tr')[2:], True)

        def __parseRemainingTables(tables):
            for table in tables:
                __parseRows(table.find_all('tr')[2:], False)

        tables = soup.find('div', class_='extract').find_all('table')
        if len(tables) > 0:
            __parseFirstTable(tables[0])
            __parseRemainingTables(tables[1:])

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        ret = HtmlImport(title, participants)
        return ret

    def parsePreparationRound(self):
        """Extract the raw table data of a preparation round.

        Returns a dict with the event title and a 'data' dict holding the row
        titles and the table cells ({'text': ..., 'meta': ...} per cell).
        """
        title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

        tableData = []
        rowTitles = []

        def __mapBr(td):
            # Replace <br> tags by newlines so the cell collapses into one string.
            for br in td.find_all('br'):
                br.replace_with('\n')
            td.smooth()
            return td

        def __extractTitles(table):
            for row in table.find_all('tr')[1:]:
                rowTitles.append(__mapBr(row.td).string)

        def __extractColumns(table):
            content = []

            def __extractContent(td):
                for br in td.find_all('br'):
                    br.replace_with('\n')
                # A nested <span> carries meta information; pull it out of the cell.
                span = td.span
                if span is not None:
                    span = span.extract()
                    meta = span.string
                else:
                    meta = None
                td.smooth()
                return {'text': td.string, 'meta': meta}

            def __extractRow(row):
                entries = []
                for entry in row.find_all('td')[1:]:
                    entries.append(__extractContent(entry))
                return entries

            for row in table.find_all('tr')[1:]:
                content.append(__extractRow(row))

            return content

        def __mergeColumns(columns1, columns2):
            # Concatenate the rows of two tables column-wise.
            return list(map(lambda x, y: x + y, columns1, columns2))

        extract = self.soup.find('div', class_='extract')
        tables = extract.find_all('table', class_='tab1')

        __extractTitles(tables[0])
        tableData = __extractColumns(tables[0])
        for table in tables[1:]:
            tableData = __mergeColumns(tableData, __extractColumns(table))

        data = {
            'titles': rowTitles,
            'table': tableData,
        }
        return {'title': title, 'data': data}

    def cleanPreparationRoundImport(self, data):
        """Strip whitespace and non-breaking spaces from a parsePreparationRound() result in place."""
        def __cleanTable(table):
            def __cleanText(s: str):
                self.l.debug('cleaning string %s', s)
                return s.strip(' \n\xa0')

            def __cleanEntry(entry):
                entry['text'] = __cleanText(entry['text'])
                if entry['meta'] is not None:
                    entry['meta'] = __cleanText(entry['meta'])

            for row in table:
                for entry in row:
                    self.l.debug(entry)
                    __cleanEntry(entry)

        data['title'] = data['title'].strip()
        __cleanTable(data['data']['table'])
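

# A minimal usage sketch of the parser above, not part of the module's API.
# The file name 'final_round.html' is a placeholder for an exported result
# page that contains an 'eventhead' header and an 'extract' section.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)

    with open('final_round.html', encoding='utf-8') as f:
        html = f.read()

    parser = HtmlParser(html)
    result = parser.parseString(html)

    print(result.title)
    for number, participant in result.participants.items():
        status = 'finalist' if participant.finalist else 'non-finalist'
        print(f'#{number}: {participant} [{status}]')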