# solo-auswertung/src/solo_turnier/html_parser.py

from bs4 import BeautifulSoup

import logging
import re

from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser


class IncompleteRoundException(Exception):
    """Signals that a result table does not belong to the final round."""

    def __init__(self, *args):
        super().__init__(*args)


class HtmlParser:
    def __init__(self, text: str, fileName: str = None):
        """Parse *text* as HTML and prepare the group and class parsers.

        Args:
            text: HTML source handed to BeautifulSoup's "html.parser" backend.
            fileName: Optional name kept for ``__repr__``; ``None`` means the
                text was passed in directly.
        """
        self.fileName = fileName
        self.l = logging.getLogger("solo_turnier.html_parser")
        self.soup = BeautifulSoup(text, features="html.parser")
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self):
        """Describe the parser by its file name, or "direct text" if none is set."""
        source = "direct text" if self.fileName is None else self.fileName
        return f"HtmlParser({source})"

    def getEventTitle(self):
        """Return the first child of the first table cell in the ``eventhead`` div."""
        eventHead = self.soup.find("div", class_="eventhead")
        firstCell = eventHead.table.tr.td
        return firstCell.contents[0]

    def guessDataFromHtmlTitle(self, title=None):
        if title is None:
            title = self.getEventTitle()

        match = re.compile('.*?ETW, Solos (.*?)(?: ".*")?').fullmatch(title)
        if match is None:
            self.l.error('Cannot parse html title "%s". Possible bug?', title)
            raise Exception(f'Cannot parse title "{title}"')

        rest = match.group(1)
        rawGroup, rawClass, dance = rest.split(" ", 2)

        return {
            "dance": dance.strip(),
            "class_": str(self.classParser.parseClass(rawClass, True)),
            "group": str(self.groupParser.parseClass(rawGroup)),
        }

    def parseResult(self):
        """Collect the placements listed in the ``extract`` section.

        The first table has to be the final round ("Endrunde"); its rows from
        the third one onwards are recorded as finalists. Rows of all further
        tables are recorded as non-finalists. If the first table carries a
        different round name, a warning is logged and nothing is collected.

        Returns:
            An ``HtmlResultImport`` wrapping a dict that maps each
            ``HtmlParticipant`` to the content of its place cell.

        Raises:
            Exception: A two-cell row whose second cell is not of the form
                ``name (number)``.
        """
        placeByParticipant = {}
        entryPattern = re.compile("(.*) \\(([0-9]+)\\)")

        def collectRows(rows, finalist: bool):
            for row in rows:
                cells = row.find_all("td")
                # Only rows made of exactly two cells (place, starter) carry data.
                if len(cells) != 2:
                    continue

                description = cells[1].contents[0]
                if description.startswith("Alle Starter weiter genommen."):
                    self.l.info("No excluded starters found.")
                    continue

                place = cells[0].contents[0]
                parsed = entryPattern.fullmatch(description)
                if parsed is None:
                    self.l.error(
                        "Could not match %s to regex search pattern", str(cells)
                    )
                    raise Exception(
                        f"Could not match {cells} to regex search pattern"
                    )

                participant = HtmlParticipant(parsed.group(1), parsed.group(2))
                participant.finalist = finalist
                placeByParticipant[participant] = place

        tables = self.soup.find("div", class_="extract").find_all("table")
        if tables:
            finalTable = tables[0]
            roundName = finalTable.tr.td.contents[0]
            if roundName == "Endrunde":
                # The first two rows of the final table are skipped.
                collectRows(finalTable.find_all("tr")[2:], True)
                for otherTable in tables[1:]:
                    collectRows(otherTable.find_all("tr"), False)
            else:
                # Incomplete round: leave the result empty.
                self.l.warning("Found table with round name %s.", roundName)

        return HtmlResultImport(placeByParticipant)

    def parsePreparationRound(self):
        """Read the ``tab1`` tables of the ``extract`` section into one grid.

        The first row of every table and the first cell of every remaining
        row are left out of the grid. The first cells of the first table's
        rows become the row titles. The cells of later tables are appended to
        the matching rows of the first table.

        Returns:
            ``{"title": ..., "data": {"titles": [...], "table": [[...]]}}``
            where each grid entry is a dict with the keys ``"text"`` and
            ``"meta"`` (content of the cell's ``span``, or ``None``).
        """
        title = self.soup.find("div", class_="eventhead").table.tr.td.contents[0]

        def brToNewline(cell):
            for lineBreak in cell.find_all("br"):
                lineBreak.replace_with("\n")

        def readCell(cell):
            brToNewline(cell)

            # The span is detached first so only the plain text stays behind.
            metaTag = cell.span
            meta = None
            if metaTag is not None:
                meta = metaTag.extract().string

            cell.smooth()
            return {"text": cell.string.replace("\xa0", " ").strip(), "meta": meta}

        def readColumns(table):
            return [
                [readCell(cell) for cell in row.find_all("td")[1:]]
                for row in table.find_all("tr")[1:]
            ]

        extract = self.soup.find("div", class_="extract")
        tables = extract.find_all("table", class_="tab1")
        firstTable = tables[0]

        rowTitles = []
        for row in firstTable.find_all("tr")[1:]:
            titleCell = row.td
            brToNewline(titleCell)
            titleCell.smooth()
            rowTitles.append(titleCell.string)

        tableData = readColumns(firstTable)
        for table in tables[1:]:
            tableData = [
                left + right for left, right in zip(tableData, readColumns(table))
            ]

        return {"title": title, "data": {"titles": rowTitles, "table": tableData}}

    def cleanPreparationRoundImport(self, data):
        """Strip surrounding whitespace from a preparation round dict in place.

        Args:
            data: Dict with a ``"title"`` string and a ``"data"`` dict whose
                ``"table"`` is a list of rows of ``{"text", "meta"}`` entries.
                The title is stripped of whitespace; every entry's text and
                non-``None`` meta are stripped of spaces, newlines and
                non-breaking spaces.

        Returns:
            None; *data* is modified directly.
        """
        strippable = " \n\xa0"

        data["title"] = data["title"].strip()
        for row in data["data"]["table"]:
            for entry in row:
                entry["text"] = entry["text"].strip(strippable)
                meta = entry["meta"]
                if meta is not None:
                    entry["meta"] = meta.strip(strippable)

    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Read per-participant places from every table in the ``extract`` section.

        For each table the ids come from the second row (first cell skipped,
        blank cells dropped). Places come from the row whose first cell starts
        with "Platz von". Classes and groups come from the optional rows
        starting with "Startklasse" and "Startgruppe".

        Args:
            competitionGroup: First component of every result key.
            competitionClass: Second component of every result key.
            dance: Third component of every result key.

        Returns:
            An ``HtmlResultTotalTable`` wrapping a dict that maps
            ``(competitionGroup, competitionClass, dance, id)`` to
            ``(place, class, group)``; class and group are ``None`` when the
            table has no such row.
        """
        results = {}

        def rowIndexWithPrefix(rows, prefix):
            # Every row is inspected on purpose (no early exit), so a malformed
            # row fails the same way regardless of where the match sits.
            hits = [row.td.contents[0].startswith(prefix) for row in rows]
            return hits.index(True) if True in hits else None

        def optionalRowValues(rows, prefix, count):
            rowIdx = rowIndexWithPrefix(rows, prefix)
            if rowIdx is None:
                return None
            return [cell.contents[0] for cell in rows[rowIdx]("td")[1 : count + 1]]

        def placeText(cell):
            # Line breaks inside a place cell become "-" separators; a
            # trailing separator is dropped.
            for lineBreak in cell("br"):
                lineBreak.replace_with("-")
            cell.smooth()
            text = cell.contents[0].strip()
            return text[:-1] if text.endswith("-") else text

        tables = self.soup.find("div", class_="extract").find_all("table")
        for table in tables:
            rows = table.find_all("tr")

            ids = []
            for cell in rows[1]("td")[1:]:
                label = cell.contents[0].strip()
                if len(label) > 0:
                    ids.append(label)
            idCount = len(ids)
            self.l.log(5, "Found ids in dataset: %s", ids)

            placeRowIdx = rowIndexWithPrefix(rows, "Platz von")
            placeCells = rows[placeRowIdx]("td")[1 : idCount + 1]
            places = [placeText(cell) for cell in placeCells]
            self.l.log(5, "Found places: %s", places)

            classes = optionalRowValues(rows, "Startklasse", idCount)
            self.l.log(5, "Classes: %s", classes)

            groups = optionalRowValues(rows, "Startgruppe", idCount)
            self.l.log(5, "Groups: %s", groups)

            for idx, participantId in enumerate(ids):
                cls = None if classes is None else classes[idx]
                grp = None if groups is None else groups[idx]

                key = (competitionGroup, competitionClass, dance, participantId)
                results[key] = (places[idx], cls, grp)

        return HtmlResultTotalTable(results)