# solo-auswertung/src/solo_turnier/html_parser.py

from bs4 import BeautifulSoup

import logging
import re

class HtmlParticipant:
    """A single participant entry read from an HTML result table.

    The values are stored exactly as they are passed in; no conversion or
    validation is performed.

    Attributes:
        name: Name of the participant.
        place: Place of the participant as given by the caller.
        finalist: Flag telling whether the entry belongs to the final round.
    """

    def __init__(self, name, place, finalist):
        self.name = name
        self.place = place
        self.finalist = finalist

    def __str__(self):
        """Return a short human-readable description (name and place)."""
        return f'{self.name} (with place {self.place})'

    def __repr__(self):
        """Return an unambiguous representation including the finalist flag.

        Without this, participants stored in containers (e.g. a dict of
        participants) show up as opaque object addresses in debug output.
        """
        return (
            f'{type(self).__name__}(name={self.name!r}, '
            f'place={self.place!r}, finalist={self.finalist!r})'
        )

class HtmlImport:
    """Result of importing one HTML result page.

    Attributes:
        title: Title of the event as found in the page.
        participants: Mapping from a participant's number to the parsed
            participant. When built by ``HtmlParser.parseString`` the keys
            are the numbers as captured from the page text, i.e. strings
            (they are not converted to ``int``).
    """

    def __init__(self, title: str, participants: dict[str, HtmlParticipant]):
        self.title = title
        self.participants = participants

class HtmlParser:
    """Parser for the HTML result and preparation round pages.

    The text handed to the constructor is parsed once and cached in
    ``self.soup``; ``getEventTitle``, ``guessDataFromHtmlTitle`` (when called
    without a title) and ``parsePreparationRound`` work on that cached tree.
    ``parseString`` parses the text passed to it instead.
    """

    # Compiled once for the class instead of on every call / table row.
    _TITLE_PATTERN = re.compile(r'.*?ETW, Solos (.*)')
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    # Long names as found in the title -> abbreviations returned to callers.
    # Unknown names are passed through unchanged.
    _CLASS_MAP = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }
    _GROUP_MAP = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    def __init__(self, text: str, fileName: str = None):
        """Parse ``text`` as HTML and remember the optional source file name.

        Args:
            text: The HTML document to parse.
            fileName: Name of the file the text came from; only used by
                ``__repr__`` to make debug output readable.
        """
        self.l = logging.getLogger('solo_turnier.html_parser')
        self.soup = BeautifulSoup(text, 'html.parser')
        self.fileName = fileName

    def __repr__(self):
        if self.fileName is None:
            return 'HtmlParser(direct text)'
        return f'HtmlParser({self.fileName})'

    def getEventTitle(self):
        """Return the first content node of the first cell in the ``eventhead`` div."""
        return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]

    def guessDataFromHtmlTitle(self, title = None):
        """Derive group, class and dance from an event title.

        The text following ``ETW, Solos `` is split at the first two spaces
        into ``<group> <class> <dance>``; the dance may itself contain
        spaces. Known group and class names are abbreviated, unknown ones are
        returned unchanged.

        Args:
            title: The title to analyse. If ``None``, the title of the cached
                document is used.

        Returns:
            A dict with the keys ``'dance'``, ``'class_'`` and ``'group'``.

        Raises:
            Exception: If the title does not contain ``ETW, Solos ``.
            ValueError: If fewer than three fields follow ``ETW, Solos ``.
        """
        if title is None:
            title = self.getEventTitle()

        match = self._TITLE_PATTERN.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        fields = match.group(1).split(' ', 2)
        if len(fields) != 3:
            # Previously this surfaced as an anonymous tuple-unpacking
            # ValueError; keep the type but say which title was at fault.
            raise ValueError(
                f'Cannot parse title "{title}": '
                'expected group, class and dance after "ETW, Solos"'
            )
        rawGroup, rawClass, dance = fields

        return {
            'dance': dance.strip(),
            'class_': self._CLASS_MAP.get(rawClass, rawClass),
            'group': self._GROUP_MAP.get(rawGroup, rawGroup)
        }

    def parseString(self, text: str):
        """Parse a result page given as text and return an ``HtmlImport``.

        All tables inside the ``extract`` div are read. The first table must
        be labelled ``Endrunde`` in its first cell; its participants are
        marked as finalists, participants of all further tables are not. In
        every table the first two rows are skipped. The participant cell is
        expected to read ``<name> (<number>)``.

        Note that this method parses ``text`` and does not use the document
        cached by the constructor.

        Args:
            text: The HTML document to parse.

        Returns:
            An ``HtmlImport`` whose ``participants`` dict is keyed by the
            participant number as a string.

        Raises:
            Exception: If the first table is not the ``Endrunde`` table or a
                participant cell does not match the expected pattern.
        """
        soup = BeautifulSoup(text, 'html.parser')
        participants = {}

        def parseRows(rows, finalist: bool):
            for row in rows:
                tds = row.find_all('td')
                place = tds[0].contents[0]

                match = self._PARTICIPANT_PATTERN.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')
                name, number = match.groups()

                # A number occurring again in a later table overwrites the
                # earlier entry.
                participants[number] = HtmlParticipant(name, place, finalist)

        tables = soup.find('div', class_='extract').find_all('table')
        if tables:
            finalTable = tables[0]
            roundName = finalTable.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')
            parseRows(finalTable.find_all('tr')[2:], True)

            for table in tables[1:]:
                parseRows(table.find_all('tr')[2:], False)

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        return HtmlImport(title, participants)

    def parsePreparationRound(self):
        """Extract the table of a preparation round from the cached document.

        All tables with class ``tab1`` inside the ``extract`` div are read.
        The first row of every table and the first cell of every row are
        skipped when collecting the entries; the first cells of the first
        table (except its first row) provide the row titles. The entries of
        further tables are appended row by row to those of the first table.

        Note that ``<br>`` tags are replaced by newlines and ``<span>``
        elements are removed in the cached document as a side effect.

        Returns:
            ``{'title': ..., 'data': {'titles': [...], 'table': [[...], ...]}}``
            where every table entry is a dict with the keys ``'text'`` (cell
            text without the span, non-breaking spaces replaced, stripped) and
            ``'meta'`` (string of the cell's first span or ``None``).
        """
        title = self.getEventTitle()
        rowTitles = []

        def replaceBreaks(td):
            for br in td.find_all('br'):
                br.replace_with('\n')

        def extractTitles(table):
            for row in table.find_all('tr')[1:]:
                cell = row.td
                replaceBreaks(cell)
                # Merge the text fragments so that .string sees one node.
                cell.smooth()
                rowTitles.append(cell.string)

        def extractContent(td):
            replaceBreaks(td)

            span = td.span
            # The span is taken out of the cell so that only the plain text
            # remains for 'text'.
            meta = span.extract().string if span is not None else None

            td.smooth()

            return {
                'text': td.string.replace('\xa0', ' ').strip(),
                'meta': meta
            }

        def extractColumns(table):
            return [
                [extractContent(entry) for entry in row.find_all('td')[1:]]
                for row in table.find_all('tr')[1:]
            ]

        def mergeColumns(columns1, columns2):
            # Row-wise concatenation; stops at the shorter of the two tables.
            return [left + right for left, right in zip(columns1, columns2)]

        extract = self.soup.find('div', class_='extract')
        tables = extract.find_all('table', class_='tab1')

        extractTitles(tables[0])
        tableData = extractColumns(tables[0])

        for table in tables[1:]:
            tableData = mergeColumns(tableData, extractColumns(table))

        data = {
            'titles': rowTitles,
            'table': tableData
        }

        return {'title': title, 'data': data}

    def cleanPreparationRoundImport(self, data):
        """Strip surrounding whitespace from a preparation round import in place.

        The title is stripped of whitespace; the ``'text'`` and (if not
        ``None``) ``'meta'`` value of every table entry is stripped of
        spaces, newlines and non-breaking spaces.

        Args:
            data: A structure as returned by ``parsePreparationRound``. It is
                modified in place; nothing is returned.
        """
        def cleanText(s: str):
            return s.strip(' \n\xa0')

        data['title'] = data['title'].strip()

        for row in data['data']['table']:
            for entry in row:
                entry['text'] = cleanText(entry['text'])
                if entry['meta'] is not None:
                    entry['meta'] = cleanText(entry['meta'])
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`from bs4 import BeautifulSoup`

			`import logging`
			`import re`

Usage of objects in HTML parser 2022-11-15 13:38:59 +00:00			`class HtmlParticipant:`
			`def __init__(self, name, place, finalist):`
			`self.name = name`
			`self.place = place`
			`self.finalist = finalist`
Merge HTML final status into main data structures 2022-11-15 17:11:40 +00:00
			`def __str__(self):`
			`return f'{self.name} (with place {self.place})'`
Usage of objects in HTML parser 2022-11-15 13:38:59 +00:00
			`class HtmlImport:`
			`def __init__(self, title: str, participants: dict[int, HtmlParticipant]):`
			`self.title = title`
			`self.participants = participants`

Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`class HtmlParser:`

Make debug output better readable 2022-12-03 13:29:01 +00:00			`def __init__(self, text: str, fileName: str = None):`
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`self.l = logging.getLogger('solo_turnier.html_parser')`
Cache parsed object in parser object 2022-11-26 07:43:15 +00:00			`self.soup = BeautifulSoup(text, 'html.parser')`
Make debug output better readable 2022-12-03 13:29:01 +00:00			`self.fileName = fileName`

			`def __repr__(self):`
			`if self.fileName is None:`
			`return 'HtmlParser(direct text)'`
			`else:`
			`return f'HtmlParser({self.fileName})'`
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00
Cache parsed object in parser object 2022-11-26 07:43:15 +00:00			`def getEventTitle(self):`
			`return self.soup.find('div', class_='eventhead').table.tr.td.contents[0]`

			`def guessDataFromHtmlTitle(self, title = None):`
			`if title is None:`
			`title = self.getEventTitle()`

			`match = re.compile('.?ETW, Solos (.)').match(title)`
			`if match is None:`
			`raise Exception(f'Cannot parse title "{title}"')`

			`rest = match.group(1)`
			`rawGroup, rawClass, dance = rest.split(' ', 2)`

			`classMap = {`
			`'Newcomer': 'Newc.',`
			`'Beginner': 'Beg.',`
			`'Advanced': 'Adv.'`
			`}`

			`groupMap = {`
			`'Kinder': 'Kin.',`
			`'Junioren': 'Jun.',`
			`'Jugend': 'Jug.',`
			`}`

			`return {`
			`'dance': dance.strip(),`
			`'class_': classMap.get(rawClass, rawClass),`
			`'group': groupMap.get(rawGroup, rawGroup)`
			`}`

Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`def parseString(self, text: str):`
			`soup = BeautifulSoup(text, 'html.parser')`

Create class to look for possible result sets 2022-11-13 17:04:49 +00:00			`participants = {}`
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00
			`def __parseRows(rows, finalist: bool):`
			`def __parseRow(row):`
			`tds = row.find_all('td')`
			`regex = re.compile('(.*) \\(([0-9]+)\\)')`

			`place = tds[0].contents[0]`

			`match = regex.fullmatch(tds[1].contents[0])`
			`if match is None:`
			`raise Exception(f'Could not match {tds} to regex search pattern')`
			`name = match.group(1)`
			`number = match.group(2)`

Usage of objects in HTML parser 2022-11-15 13:38:59 +00:00			`participant = HtmlParticipant(name, place, finalist)`
Create class to look for possible result sets 2022-11-13 17:04:49 +00:00			`participants[number] = participant`
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00
			`for row in rows:`
			`__parseRow(row)`

			`def __parseFirstTable(table):`
			`roundName = table.tr.td.contents[0]`
			`if roundName != 'Endrunde':`
			`raise Exception('Could not parse HTML file')`

			`__parseRows(table.find_all('tr')[2:], True)`

			`def __parseRemainingTables(tables):`
			`for table in tables:`
			`__parseRows(table.find_all('tr')[2:], False)`

			`tables = soup.find('div', class_='extract').find_all('table')`
			`if len(tables) > 0:`
			`__parseFirstTable(tables[0])`

			`__parseRemainingTables(tables[1:])`

Create class to look for possible result sets 2022-11-13 17:04:49 +00:00			`title = soup.find('div', class_='eventhead').table.tr.td.contents[0]`

Usage of objects in HTML parser 2022-11-15 13:38:59 +00:00			`ret = HtmlImport(title, participants)`
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`return ret`
Extract meta data from title in HTML 2022-11-15 13:19:10 +00:00
Cache parsed object in parser object 2022-11-26 07:43:15 +00:00			`def parsePreparationRound(self):`
			`title = self.soup.find('div', class_='eventhead').table.tr.td.contents[0]`
Create script to read setup rounds 2022-11-19 06:38:22 +00:00			`tableData = []`
			`rowTitles = []`

			`def __mapBr(td):`
			`for br in td.find_all('br'):`
			`br.replace_with('\n')`
			`td.smooth()`
			`return td`

			`def __extractTitles(table):`
			`for row in table.find_all('tr')[1:]:`
			`rowTitles.append(__mapBr(row.td).string)`

			`def __extractColumns(table):`
			`content = []`

			`def __extractContent(td):`
			`for br in td.find_all('br'):`
			`br.replace_with('\n')`

			`span = td.span`
			`if span is not None:`
			`span = span.extract()`
			`meta = span.string`
			`else:`
			`meta = None`

			`td.smooth()`

			`return {`
Clean up HTML import from nbsp chars 2022-11-26 07:43:46 +00:00			`'text': td.string.replace('\xa0', ' ').strip(),`
Create script to read setup rounds 2022-11-19 06:38:22 +00:00			`'meta': meta`
			`}`

			`def __extractRow(row):`
			`entries = []`
			`for entry in row.find_all('td')[1:]:`
			`entries.append(__extractContent(entry))`
			`return entries`

			`for row in table.find_all('tr')[1:]:`
			`content.append(__extractRow(row))`

			`return content`

			`def __mergeColumns(columns1, columns2):`
			`return list(map(lambda x, y: x + y, columns1, columns2))`

Cache parsed object in parser object 2022-11-26 07:43:15 +00:00			`extract = self.soup.find('div', class_='extract')`
Create script to read setup rounds 2022-11-19 06:38:22 +00:00			`tables = extract.find_all('table', class_='tab1')`

			`__extractTitles(tables[0])`
			`tableData = __extractColumns(tables[0])`

			`for table in tables[1:]:`
			`tableData = __mergeColumns(tableData, __extractColumns(table))`

			`data = {`
			`'titles': rowTitles,`
			`'table': tableData`
			`}`

			`return {'title': title, 'data': data}`

			`def cleanPreparationRoundImport(self, data):`
			`def __cleanTable(table):`
			`def __cleanText(s: str):`
Parse preview rounds in batch script 2022-11-27 08:10:17 +00:00			`# print("cleaning string ", s)`
Create script to read setup rounds 2022-11-19 06:38:22 +00:00			`return s.strip(' \n\xa0')`

			`def __cleanEntry(entry):`
			`entry['text'] = __cleanText(entry['text'])`
			`if entry['meta'] is not None:`
			`entry['meta'] = __cleanText(entry['meta'])`

			`for row in table:`
			`for entry in row:`
Parse preview rounds in batch script 2022-11-27 08:10:17 +00:00			`# print(entry)`
Create script to read setup rounds 2022-11-19 06:38:22 +00:00			`__cleanEntry(entry)`

			`data['title'] = data['title'].strip()`
			`__cleanTable(data['data']['table'])`