import logging
import re
from typing import Optional

from bs4 import BeautifulSoup

from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser


class IncompleteRoundException(Exception):
    """Raised when the first result table is not the final round ("Endrunde")."""


class HtmlParser:
    """Parse competition HTML pages that have been loaded into BeautifulSoup.

    The parser reads the event title from the ``div.eventhead`` element and
    the result/preparation tables from the ``div.extract`` element.
    """

    # Event title: everything after "ETW, Solos " up to an optional trailing
    # quoted part is captured as group 1.
    _TITLE_RE = re.compile('.*?ETW, Solos (.*?)(?: ".*")?')
    # Participant cell: "<name> (<number>)".
    _PARTICIPANT_RE = re.compile(r"(.*) \(([0-9]+)\)")

    def __init__(self, text: str, fileName: Optional[str] = None):
        """Create a parser for the HTML document given in *text*.

        Args:
            text: The HTML source to parse.
            fileName: Optional name of the source file; only used by
                ``__repr__``.
        """
        self.l = logging.getLogger("solo_turnier.html_parser")
        self.soup = BeautifulSoup(text, "html.parser")
        self.fileName = fileName
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self):
        if self.fileName is None:
            return "HtmlParser(direct text)"
        return f"HtmlParser({self.fileName})"

    def getEventTitle(self):
        """Return the first content node of the first cell in ``div.eventhead``."""
        return self.soup.find("div", class_="eventhead").table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title=None):
        """Derive group, class and dance from an event title.

        Args:
            title: The title to analyse. When ``None`` the title is read from
                the document via ``getEventTitle``.

        Returns:
            A dict with the keys ``"dance"``, ``"class_"`` and ``"group"``.
            Class and group are the string forms of the values returned by
            the class parser and group parser.

        Raises:
            Exception: If the title does not match the expected pattern.
            ValueError: If the matched part has fewer than three
                space-separated fields.
        """
        if title is None:
            title = self.getEventTitle()

        match = self._TITLE_RE.fullmatch(title)
        if match is None:
            self.l.error('Cannot parse html title "%s". Possible bug?', title)
            raise Exception(f'Cannot parse title "{title}"')

        # The captured part is "<group> <class> <dance...>"; the dance may
        # itself contain spaces, hence the split limit.
        rawGroup, rawClass, dance = match.group(1).split(" ", 2)

        return {
            "dance": dance.strip(),
            "class_": str(self.classParser.parseClass(rawClass, True)),
            "group": str(self.groupParser.parseClass(rawGroup)),
        }

    def parseResult(self):
        """Collect participants and their places from the tables in ``div.extract``.

        The first table must be the final round ("Endrunde"); its rows after
        the first two are parsed as finalists. Rows of all further tables are
        parsed as non-finalists. If the first table is not the final round,
        a warning is logged and no participants are collected.

        Returns:
            An ``HtmlResultImport`` wrapping a dict that maps each
            ``HtmlParticipant`` to the content of its place cell.

        Raises:
            Exception: If a two-cell row does not look like
                ``"<name> (<number>)"``.
        """
        participants = {}

        def parseRow(row, finalist: bool):
            tds = row.find_all("td")
            if len(tds) != 2:
                # Only two-cell rows (place, participant) carry data.
                return

            if tds[1].contents[0].startswith("Alle Starter weiter genommen."):
                self.l.info("No excluded starters found.")
                return

            place = tds[0].contents[0]
            match = self._PARTICIPANT_RE.fullmatch(tds[1].contents[0])
            if match is None:
                self.l.error("Could not match %s to regex search pattern", str(tds))
                raise Exception(f"Could not match {tds} to regex search pattern")

            participant = HtmlParticipant(match.group(1), match.group(2))
            participant.finalist = finalist
            participants[participant] = place

        def parseRows(rows, finalist: bool):
            for row in rows:
                parseRow(row, finalist)

        def parseFirstTable(table):
            roundName = table.tr.td.contents[0]
            if roundName != "Endrunde":
                self.l.warning("Found table with round name %s.", roundName)
                raise IncompleteRoundException("Could not parse HTML file")

            # The first two rows of the final-round table are skipped.
            parseRows(table.find_all("tr")[2:], True)

        def parseRemainingTables(tables):
            for table in tables:
                parseRows(table.find_all("tr"), False)

        tables = self.soup.find("div", class_="extract").find_all("table")

        try:
            if tables:
                parseFirstTable(tables[0])
                parseRemainingTables(tables[1:])
        except IncompleteRoundException:
            # Deliberate best effort: the warning has already been logged and
            # the (empty) result is returned for incomplete rounds.
            pass

        return HtmlResultImport(participants)

    def parsePreparationRound(self):
        """Extract the preparation-round tables (``table.tab1``) of the document.

        The first column of the first table provides the row titles. The
        remaining columns of every table are concatenated row by row.

        Returns:
            ``{"title": <event title>, "data": {"titles": [...], "table": [...]}}``
            where ``table`` is a list of rows and every row is a list of
            ``{"text": ..., "meta": ...}`` dicts. ``meta`` is the string of
            the cell's first ``<span>`` or ``None`` if there is no span.
        """
        title = self.getEventTitle()

        def replaceBreaks(td):
            # Turn <br> tags into newlines so the cell text can be read as one
            # string once adjacent strings have been merged.
            for br in td.find_all("br"):
                br.replace_with("\n")

        def extractTitles(table):
            titles = []
            for row in table.find_all("tr")[1:]:
                cell = row.td
                replaceBreaks(cell)
                cell.smooth()
                titles.append(cell.string)
            return titles

        def extractContent(td):
            replaceBreaks(td)

            # The span is removed before smoothing so that its text is not
            # part of the cell text.
            span = td.span
            if span is not None:
                span = span.extract()
                meta = span.string
            else:
                meta = None

            td.smooth()

            return {"text": td.string.replace("\xa0", " ").strip(), "meta": meta}

        def extractColumns(table):
            # Skip the header row and the title column.
            return [
                [extractContent(td) for td in row.find_all("td")[1:]]
                for row in table.find_all("tr")[1:]
            ]

        def mergeColumns(columns1, columns2):
            return [left + right for left, right in zip(columns1, columns2)]

        extract = self.soup.find("div", class_="extract")
        tables = extract.find_all("table", class_="tab1")

        rowTitles = extractTitles(tables[0])
        tableData = extractColumns(tables[0])

        for table in tables[1:]:
            tableData = mergeColumns(tableData, extractColumns(table))

        data = {"titles": rowTitles, "table": tableData}

        return {"title": title, "data": data}

    def cleanPreparationRoundImport(self, data):
        """Strip surrounding whitespace from a preparation-round import in place.

        Args:
            data: A structure as returned by ``parsePreparationRound``. Its
                title and every entry's ``"text"`` and (non-``None``)
                ``"meta"`` are replaced by their stripped versions.
        """

        def cleanText(s: str):
            return s.strip(" \n\xa0")

        data["title"] = data["title"].strip()

        for row in data["data"]["table"]:
            for entry in row:
                entry["text"] = cleanText(entry["text"])
                if entry["meta"] is not None:
                    entry["meta"] = cleanText(entry["meta"])

    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Read the per-participant results from all tables in ``div.extract``.

        Args:
            competitionGroup: Group stored in every result key.
            competitionClass: Class stored in every result key.
            dance: Dance stored in every result key.

        Returns:
            An ``HtmlResultTotalTable`` wrapping a dict that maps
            ``(competitionGroup, competitionClass, dance, id)`` to
            ``(place, class, group)``. Class and group are ``None`` when the
            table has no "Startklasse" / "Startgruppe" row.

        Raises:
            Exception: If a table has no row starting with "Platz von".
        """
        participants = {}

        def parseTable(table):
            rows = table.find_all("tr")

            def getIds():
                # The ids are in the second row; the first cell is the label.
                stripped = (td.contents[0].strip() for td in rows[1]("td")[1:])
                return [text for text in stripped if text]

            ids = getIds()
            numIds = len(ids)
            self.l.log(5, "Found ids in dataset: %s", ids)

            def findRowIndex(prefixStr):
                # Rows without a <td>, with an empty first cell, or whose first
                # cell starts with a tag cannot carry a label and are skipped.
                for index, row in enumerate(rows):
                    cell = row.td
                    if cell is None or not cell.contents:
                        continue
                    label = cell.contents[0]
                    if isinstance(label, str) and label.startswith(prefixStr):
                        return index
                return None

            def dataCells(rowIdx):
                # NOTE(review): takes the first numIds data cells; this assumes
                # that empty id cells only occur at the end of the id row.
                return rows[rowIdx]("td")[1 : (numIds + 1)]

            def getPlaces():
                placeRowIdx = findRowIndex("Platz von")
                if placeRowIdx is None:
                    self.l.error('Could not find a row starting with "Platz von".')
                    raise Exception(
                        'Could not find a row starting with "Platz von" in result table'
                    )

                def getSinglePlaceStr(tag):
                    # A place range is split by <br>; join it with a dash and
                    # drop a dangling dash left by a trailing <br>.
                    for br in tag("br"):
                        br.replace_with("-")
                    tag.smooth()
                    rawStr = tag.contents[0].strip()
                    if rawStr.endswith("-"):
                        rawStr = rawStr[:-1]
                    return rawStr

                return [getSinglePlaceStr(tag) for tag in dataCells(placeRowIdx)]

            def getOptionalRow(prefixStr):
                rowIdx = findRowIndex(prefixStr)
                if rowIdx is None:
                    return None
                return [tag.contents[0] for tag in dataCells(rowIdx)]

            places = getPlaces()
            self.l.log(5, "Found places: %s", places)

            classes = getOptionalRow("Startklasse")
            self.l.log(5, "Classes: %s", classes)

            groups = getOptionalRow("Startgruppe")
            self.l.log(5, "Groups: %s", groups)

            for idx, participantId in enumerate(ids):
                cls = classes[idx] if classes is not None else None
                grp = groups[idx] if groups is not None else None

                key = (competitionGroup, competitionClass, dance, participantId)
                participants[key] = (places[idx], cls, grp)

        tables = self.soup.find("div", class_="extract").find_all("table")
        for table in tables:
            parseTable(table)

        return HtmlResultTotalTable(participants)