# solo-auswertung/src/solo_turnier/html_parser.py

from bs4 import BeautifulSoup

import logging
import re

from .types import HtmlPreviewParticipant, HtmlParticipant, HtmlResultTotalTable
from .types import HtmlPreviewImport as HtmlImport, HtmlResultImport
from .group import GroupParser
from .competition_class import CompetitionClassParser

class IncompleteRoundException(Exception):
    """Raised when a result table does not belong to the expected round.

    All positional arguments are forwarded unchanged to ``Exception``.
    """

    def __init__(self, *args):
        super().__init__(*args)

class HtmlParser:
    """Extract tournament data from an HTML document using BeautifulSoup.

    The parser expects the document to contain a ``div`` with class
    ``eventhead`` (holding the event title) and a ``div`` with class
    ``extract`` (holding the result tables).
    """

    # Event title: anything up to 'ETW, Solos ', then the payload
    # ('<group> <class> <dance>'), optionally followed by a quoted suffix.
    _TITLE_PATTERN = re.compile(r'.*?ETW, Solos (.*?)(?: ".*")?')
    # Participant cell of the form '<name> (<number>)'.
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self, text: str, fileName: str = None):
        """Parse ``text`` as HTML.

        Args:
            text: The raw HTML markup.
            fileName: Optional name of the originating file; only used by
                ``__repr__``.
        """
        self.l = logging.getLogger('solo_turnier.html_parser')
        self.soup = BeautifulSoup(text, 'html.parser')
        self.fileName = fileName
        self.groupParser = GroupParser()
        self.classParser = CompetitionClassParser()

    def __repr__(self):
        if self.fileName is None:
            return 'HtmlParser(direct text)'
        return f'HtmlParser({self.fileName})'

    @staticmethod
    def _replaceBreaks(tag, replacement: str):
        """Replace every ``<br>`` below ``tag`` with the text ``replacement``.

        The caller is responsible for calling ``tag.smooth()`` afterwards if
        adjacent strings should be merged.
        """
        for br in tag.find_all('br'):
            br.replace_with(replacement)

    def getEventTitle(self):
        """Return the first child of the first cell of the ``eventhead`` table."""
        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title = None):
        """Derive group, class and dance from the event title.

        Args:
            title: The title to analyse. If ``None``, the title is read from
                the document via ``getEventTitle``.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``;
            class and group are the string forms of the parsed values.

        Raises:
            Exception: The title does not match the expected pattern.
            ValueError: The title payload does not consist of the three
                space-separated fields group, class and dance.
        """
        if title is None:
            title = self.getEventTitle()

        match = self._TITLE_PATTERN.fullmatch(title)
        if match is None:
            self.l.error('Cannot parse html title "%s". Possible bug?', title)
            raise Exception(f'Cannot parse title "{title}"')

        # The dance name may itself contain spaces, so split at most twice.
        fields = match.group(1).split(' ', 2)
        if len(fields) != 3:
            self.l.error('Cannot parse html title "%s". Possible bug?', title)
            raise ValueError(
                f'Cannot split title "{title}" into group, class and dance'
            )
        rawGroup, rawClass, dance = fields

        return {
            'dance': dance.strip(),
            'class_': str(self.classParser.parseClass(rawClass, True)),
            'group': str(self.groupParser.parseClass(rawGroup))
        }

    def parseResult(self):
        """Parse the result tables into a participant-to-place mapping.

        The first table must be the final round ('Endrunde'); its
        participants are marked as finalists. Participants of all further
        tables are marked as non-finalists. If the first table is not the
        final round, nothing is parsed and an empty result is returned.

        Returns:
            An ``HtmlResultImport`` built from a dict that maps each
            ``HtmlParticipant`` to the content of its place cell.

        Raises:
            Exception: A participant cell is not of the form 'name (number)'.
        """
        participants = {}

        def parseRow(row, finalist: bool):
            cells = row.find_all('td')

            # Only rows of the form (place, participant) carry data.
            if len(cells) != 2:
                return

            entry = cells[1].contents[0]
            # Marker text (German: "all starters advanced"), not a participant.
            if entry.startswith('Alle Starter weiter genommen.'):
                self.l.info('No excluded starters found.')
                return

            place = cells[0].contents[0]

            match = self._PARTICIPANT_PATTERN.fullmatch(entry)
            if match is None:
                self.l.error('Could not match %s to regex search pattern', str(cells))
                raise Exception(f'Could not match {cells} to regex search pattern')

            participant = HtmlParticipant(match.group(1), match.group(2))
            participant.finalist = finalist
            participants[participant] = place

        def parseRows(rows, finalist: bool):
            for row in rows:
                parseRow(row, finalist)

        def parseFirstTable(table):
            roundName = table.tr.td.contents[0]
            if roundName != 'Endrunde':
                self.l.warning('Found table with round name %s.', roundName)
                raise IncompleteRoundException('Could not parse HTML file')

            # The first two rows of the final table are skipped.
            parseRows(table.find_all('tr')[2:], True)

        tables = self.soup.find('div', class_='extract').find_all('table')

        try:
            if tables:
                parseFirstTable(tables[0])
                for table in tables[1:]:
                    parseRows(table.find_all('tr'), False)
        except IncompleteRoundException:
            # Deliberate: an incomplete round was already logged as a warning
            # and simply yields an empty result instead of an error.
            pass

        return HtmlResultImport(participants)

    def parsePreparationRound(self):
        """Parse the ``tab1`` tables of the ``extract`` div into a grid.

        The first cell of every row (except the first row) of the first
        table is used as row title. The remaining cells of every table are
        collected row-wise; rows of subsequent tables are appended to the
        corresponding rows of the first table.

        Returns:
            ``{'title': <event title>, 'data': {'titles': [...], 'table': [...]}}``
            where every table entry is a dict with the keys ``'text'`` and
            ``'meta'`` (the content of the cell's ``span``, or ``None``).
        """
        title = self.getEventTitle()
        rowTitles = []

        def extractTitles(table):
            for row in table.find_all('tr')[1:]:
                cell = row.td
                self._replaceBreaks(cell, '\n')
                cell.smooth()
                rowTitles.append(cell.string)

        def extractContent(td):
            self._replaceBreaks(td, '\n')

            # Detach the span so that only the plain cell text remains.
            span = td.span
            if span is not None:
                meta = span.extract().string
            else:
                meta = None

            td.smooth()

            return {
                'text': td.string.replace('\xa0', ' ').strip(),
                'meta': meta
            }

        def extractRow(row):
            # The first cell is the row title and handled separately.
            return [extractContent(cell) for cell in row.find_all('td')[1:]]

        def extractColumns(table):
            return [extractRow(row) for row in table.find_all('tr')[1:]]

        extract = self.soup.find('div', class_='extract')
        tables = extract.find_all('table', class_='tab1')

        extractTitles(tables[0])
        tableData = extractColumns(tables[0])

        for table in tables[1:]:
            # Concatenate row by row; surplus rows on either side are dropped.
            tableData = [
                left + right
                for left, right in zip(tableData, extractColumns(table))
            ]

        data = {
            'titles': rowTitles,
            'table': tableData
        }

        return {'title': title, 'data': data}

    def cleanPreparationRoundImport(self, data):
        """Strip surrounding whitespace from a preparation round import.

        ``data`` is modified in place: the title is stripped, and for every
        table entry spaces, newlines and non-breaking spaces are removed
        from both ends of ``'text'`` and (if present) ``'meta'``.

        Args:
            data: A structure as returned by ``parsePreparationRound``.
        """
        def cleanText(s: str):
            return s.strip(' \n\xa0')

        data['title'] = data['title'].strip()

        for row in data['data']['table']:
            for entry in row:
                entry['text'] = cleanText(entry['text'])
                if entry['meta'] is not None:
                    entry['meta'] = cleanText(entry['meta'])

    def parseIndividualResult(self, competitionGroup, competitionClass, dance):
        """Parse per-participant places from all tables of the ``extract`` div.

        Args:
            competitionGroup: Group stored in the key of every result entry.
            competitionClass: Class stored in the key of every result entry.
            dance: Dance stored in the key of every result entry.

        Returns:
            An ``HtmlResultTotalTable`` built from a dict mapping
            ``(competitionGroup, competitionClass, dance, id)`` to
            ``(place, class, group)``; class and group are ``None`` if the
            table has no 'Startklasse' / 'Startgruppe' row.

        Raises:
            Exception: A table has no row starting with 'Platz von'.
        """
        participants = {}

        def parseTable(table):
            rows = table.find_all('tr')

            # The second row holds the ids; empty cells are ignored.
            strippedIds = (
                cell.contents[0].strip() for cell in rows[1]('td')[1:]
            )
            ids = [text for text in strippedIds if text]
            numIds = len(ids)
            self.l.log(5, 'Found ids in dataset: %s', ids)

            def findRowIndex(prefixStr):
                """Return the index of the first row whose first cell starts
                with ``prefixStr``, or ``None`` if there is no such row."""
                for index, row in enumerate(rows):
                    firstCell = row.td
                    # Rows without a (non-empty) data cell cannot match.
                    if firstCell is None or not firstCell.contents:
                        continue
                    if firstCell.contents[0].startswith(prefixStr):
                        return index
                return None

            def getDataCells(rowIndex):
                # Skip the label cell and take one cell per id.
                return rows[rowIndex]('td')[1:(numIds + 1)]

            def getSinglePlaceStr(tag):
                # Line breaks separate the bounds of a place range.
                self._replaceBreaks(tag, '-')
                tag.smooth()
                rawStr = tag.contents[0].strip()
                if rawStr.endswith('-'):
                    rawStr = rawStr[:-1]
                return rawStr

            def getPlaces():
                placeRowIdx = findRowIndex('Platz von')
                if placeRowIdx is None:
                    self.l.error('Could not find a row starting with "Platz von".')
                    raise Exception('Could not find a row starting with "Platz von"')
                return [getSinglePlaceStr(tag) for tag in getDataCells(placeRowIdx)]

            def getOptionalRow(prefixStr):
                rowIdx = findRowIndex(prefixStr)
                if rowIdx is None:
                    return None
                return [cell.contents[0] for cell in getDataCells(rowIdx)]

            places = getPlaces()
            self.l.log(5, 'Found places: %s', places)

            classes = getOptionalRow('Startklasse')
            self.l.log(5, 'Classes: %s', classes)

            groups = getOptionalRow('Startgruppe')
            self.l.log(5, 'Groups: %s', groups)

            for idx, participantId in enumerate(ids):
                cls = classes[idx] if classes is not None else None
                grp = groups[idx] if groups is not None else None

                key = (competitionGroup, competitionClass, dance, participantId)
                participants[key] = (places[idx], cls, grp)

        tables = self.soup.find('div', class_='extract').find_all('table')
        for table in tables:
            parseTable(table)

        return HtmlResultTotalTable(participants)