import logging
import re
from typing import Optional

from bs4 import BeautifulSoup

from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser


class HtmlParser:
    """Extract competition data from one HTML result/preview page.

    The page is parsed once with BeautifulSoup in the constructor; every
    ``parse*`` method then reads from that parsed tree. All methods expect
    the data tables inside a ``<div class="extract">`` and the event title
    inside a ``<div class="eventhead">``.
    """

    # Title layout: "<anything>ETW, Solos <group> <class> <dance>".
    _TITLE_PATTERN = re.compile(r'.*?ETW, Solos (.*)')
    # Result cell layout: "<name> (<start number>)".
    _RESULT_ENTRY_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self, text: str, fileName: Optional[str] = None):
        """Parse ``text`` as HTML.

        Args:
            text: The raw HTML document.
            fileName: Optional name of the file the text came from; only
                used by ``__repr__``.
        """
        self.l = logging.getLogger('solo_turnier.html_parser')
        self.soup = BeautifulSoup(text, 'html.parser')
        self.fileName = fileName
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self):
        if self.fileName is None:
            return 'HtmlParser(direct text)'
        return f'HtmlParser({self.fileName})'

    def getEventTitle(self):
        """Return the first child of the first cell of the event header table."""
        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title=None):
        """Derive dance, class and group from an event title.

        Args:
            title: Title to analyse; when ``None`` the title of the parsed
                document is used.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``;
            class and group are the string forms of whatever the class and
            group parsers return.

        Raises:
            Exception: If the title does not contain ``"ETW, Solos "``.
            ValueError: If fewer than three space-separated parts follow
                that marker.
        """
        if title is None:
            title = self.getEventTitle()

        match = self._TITLE_PATTERN.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        # At most two splits: the dance name itself may contain spaces.
        parts = match.group(1).split(' ', 2)
        if len(parts) != 3:
            raise ValueError(
                f'Cannot parse title "{title}": expected group, class and dance'
            )
        rawGroup, rawClass, dance = parts

        return {
            'dance': dance.strip(),
            # NOTE(review): meaning of the second argument is defined by
            # CompetitionClassParser.parseClass -- not visible here.
            'class_': str(self.classParser.parseClass(rawClass, True)),
            'group': str(self.groupParser.parseClass(rawGroup)),
        }

    def parseResult(self):
        """Read the placement tables of a result page.

        The first table must be the final round (its first cell reads
        ``'Endrunde'``); participants found there are marked as finalists.
        Participants of all following tables are marked as non-finalists.
        Only rows with exactly two ``<td>`` cells are evaluated.

        Returns:
            ``HtmlResultImport`` built from a dict that maps each
            ``HtmlParticipant`` to the content of its place cell.

        Raises:
            Exception: If the first table is not the final round or a
                participant cell does not look like ``"<name> (<number>)"``.
        """
        participants = {}

        def parseRows(rows, finalist: bool):
            for row in rows:
                cells = row.find_all('td')
                if len(cells) != 2:
                    continue

                place = cells[0].contents[0]
                match = self._RESULT_ENTRY_PATTERN.fullmatch(cells[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {cells} to regex search pattern')

                participant = HtmlParticipant(match.group(1), match.group(2))
                participant.finalist = finalist
                participants[participant] = place

        tables = self.soup.find('div', class_='extract').find_all('table')
        if len(tables) > 0:
            finalTable = tables[0]
            roundName = finalTable.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')
            # The first two rows of the final table are skipped.
            parseRows(finalTable.find_all('tr')[2:], True)

            for table in tables[1:]:
                parseRows(table.find_all('tr'), False)

        return HtmlResultImport(participants)

    def parsePreparationRound(self):
        """Read the tables of a preparation-round page.

        All ``<table class="tab1">`` elements are read. In each of them the
        first row is skipped and the first cell of every remaining row is
        treated as the row title (titles are taken from the first table
        only). The remaining cells of the tables are concatenated row by
        row.

        Returns:
            ``{'title': <event title>, 'data': {'titles': [...],
            'table': [[{'text': ..., 'meta': ...}, ...], ...]}}`` where
            ``meta`` is the string of the cell's ``<span>`` or ``None``
            when the cell has no span.
        """
        title = self.getEventTitle()
        rowTitles = []

        def brToNewline(td):
            for br in td.find_all('br'):
                br.replace_with('\n')
            td.smooth()
            return td

        def extractTitles(table):
            for row in table.find_all('tr')[1:]:
                rowTitles.append(brToNewline(row.td).string)

        def extractContent(td):
            for br in td.find_all('br'):
                br.replace_with('\n')

            # The span is removed from the cell before the cell is smoothed,
            # so its text does not end up in 'text'.
            span = td.span
            if span is not None:
                span = span.extract()
                meta = span.string
            else:
                meta = None

            td.smooth()
            return {
                'text': td.string.replace('\xa0', ' ').strip(),
                'meta': meta,
            }

        def extractColumns(table):
            return [
                [extractContent(td) for td in row.find_all('td')[1:]]
                for row in table.find_all('tr')[1:]
            ]

        def mergeColumns(columns1, columns2):
            # Row-wise concatenation; stops at the shorter of the two tables.
            return [left + right for left, right in zip(columns1, columns2)]

        extract = self.soup.find('div', class_='extract')
        tables = extract.find_all('table', class_='tab1')

        extractTitles(tables[0])
        tableData = extractColumns(tables[0])
        for table in tables[1:]:
            tableData = mergeColumns(tableData, extractColumns(table))

        return {
            'title': title,
            'data': {'titles': rowTitles, 'table': tableData},
        }

    def cleanPreparationRoundImport(self, data):
        """Strip surrounding whitespace from a preparation-round import.

        ``data`` has the layout returned by ``parsePreparationRound`` and is
        modified in place: the title is stripped, and spaces, newlines and
        non-breaking spaces are removed from both ends of every entry's
        ``'text'`` and (if present) ``'meta'``. Nothing is returned.
        """
        def cleanText(s: str):
            return s.strip(' \n\xa0')

        data['title'] = data['title'].strip()

        for row in data['data']['table']:
            for entry in row:
                entry['text'] = cleanText(entry['text'])
                if entry['meta'] is not None:
                    entry['meta'] = cleanText(entry['meta'])

    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Read per-participant places from the tables of a result page.

        For every table inside the ``extract`` div the participant ids are
        taken from the second row, the places from the row starting with
        ``'Platz von'`` and, when present, classes and groups from the rows
        starting with ``'Startklasse'`` / ``'Startgruppe'``.

        Args:
            competitionGroup: Stored as first element of each result key.
            competitionClass: Stored as second element of each result key.
            dance: Stored as third element of each result key.

        Returns:
            ``HtmlResultTotalTable`` built from a dict mapping
            ``(competitionGroup, competitionClass, dance, id)`` to
            ``(place, class or None, group or None)``.

        Raises:
            Exception: If a table has no row starting with ``'Platz von'``.
        """
        participants = {}

        def parseTable(table):
            rows = table.find_all('tr')

            def findRowIndex(prefixStr):
                """Index of the first row whose first cell text starts with prefixStr."""
                for rowIdx, row in enumerate(rows):
                    firstCell = row.td
                    # Rows without <td> cells or with an empty first cell
                    # cannot match and are skipped.
                    if firstCell is None or not firstCell.contents:
                        continue
                    label = firstCell.contents[0]
                    # Only text nodes can be compared; nested tags are skipped.
                    if isinstance(label, str) and label.startswith(prefixStr):
                        return rowIdx
                return None

            # Participant ids: non-blank cells of the second row, first
            # column excluded.
            ids = [
                cell.contents[0].strip()
                for cell in rows[1].find_all('td')[1:]
                if len(cell.contents[0].strip()) > 0
            ]
            numIds = len(ids)
            self.l.log(5, 'Found ids in dataset: %s', ids)

            def getSinglePlaceStr(tag):
                # Line breaks inside the cell are joined with '-'; a dash
                # left at the very end is dropped.
                for br in tag.find_all('br'):
                    br.replace_with('-')
                tag.smooth()
                rawStr = tag.contents[0].strip()
                if rawStr.endswith('-'):
                    rawStr = rawStr[:-1]
                return rawStr

            placeRowIdx = findRowIndex('Platz von')
            if placeRowIdx is None:
                raise Exception(
                    'Could not find the row starting with "Platz von" in result table'
                )
            placeTags = rows[placeRowIdx].find_all('td')[1:numIds + 1]
            places = [getSinglePlaceStr(tag) for tag in placeTags]
            self.l.log(5, 'Found places: %s', places)

            def getOptionalRow(prefixStr):
                """First content of each participant cell of the row, or None."""
                rowIdx = findRowIndex(prefixStr)
                if rowIdx is None:
                    return None
                cells = rows[rowIdx].find_all('td')[1:numIds + 1]
                return [cell.contents[0] for cell in cells]

            classes = getOptionalRow('Startklasse')
            self.l.log(5, 'Classes: %s', classes)

            groups = getOptionalRow('Startgruppe')
            self.l.log(5, 'Groups: %s', groups)

            for idx, participantId in enumerate(ids):
                cls = classes[idx] if classes is not None else None
                grp = groups[idx] if groups is not None else None

                key = (competitionGroup, competitionClass, dance, participantId)
                participants[key] = (places[idx], cls, grp)

        tables = self.soup.find('div', class_='extract').find_all('table')
        for table in tables:
            parseTable(table)

        return HtmlResultTotalTable(participants)