import logging
import re

from bs4 import BeautifulSoup


class HtmlParticipant:
    """A single participant parsed from an HTML result export.

    Attributes:
        name: The participant's display name.
        place: The place reached, taken verbatim from the HTML cell
            (shape not normalized here — callers treat it opaquely).
        finalist: True if the participant took part in the final round.
    """

    def __init__(self, name: str, place, finalist: bool):
        self.name = name
        self.place = place
        self.finalist = finalist

    def __str__(self) -> str:
        # Human-readable form used in logs/output; format kept stable.
        return f'{self.name} (with place {self.place})'

    def __repr__(self) -> str:
        # Debug-friendly representation; __str__ is unchanged for callers.
        return (f'{type(self).__name__}(name={self.name!r}, '
                f'place={self.place!r}, finalist={self.finalist!r})')


class HtmlImport:
    """Result of parsing one HTML export: the event title plus all
    participants found in its tables."""

    def __init__(self, title: str, participants: dict[int, HtmlParticipant]):
        # Mapping of start number to the parsed participant entry.
        self.participants = participants
        # Competition title as shown in the page header.
        self.title = title


class HtmlParser:
    """Parses the HTML result export of a solo dance competition.

    Extracts the event title and all participants (with place and
    finalist status) from the tables inside the ``extract`` div.
    """

    # Pattern for a participant cell: "<name> (<start number>)".
    # Hoisted to class level so it is compiled once, not once per row.
    _PARTICIPANT_RE = re.compile(r'(.*) \(([0-9]+)\)')

    # Pattern splitting the event title after the "ETW, Solos" marker.
    _TITLE_RE = re.compile('.*?ETW, Solos (.*)')

    # Abbreviations of the class names; unknown names pass through as-is.
    _CLASS_MAP = {
        'Newcomer': 'Newc.',
        'Beginner': 'Beg.',
        'Advanced': 'Adv.',
    }

    # Abbreviations of the age-group names; unknown names pass through.
    _GROUP_MAP = {
        'Kinder': 'Kin.',
        'Junioren': 'Jun.',
        'Jugend': 'Jug.',
    }

    def __init__(self):
        # Attribute name 'l' kept for backward compatibility with callers.
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse a full HTML document into an HtmlImport.

        :param text: The raw HTML source.
        :returns: HtmlImport with the page title and the participants,
            keyed by start number.
        :raises Exception: If the expected table layout is not found or a
            participant cell does not match the name/number pattern.
        """
        soup = BeautifulSoup(text, 'html.parser')

        participants = {}

        def __parseRows(rows, finalist: bool):
            def __parseRow(row):
                tds = row.find_all('td')
                # First cell holds the place, second "<name> (<number>)".
                place = tds[0].contents[0]
                match = self._PARTICIPANT_RE.fullmatch(tds[1].contents[0])
                if match is None:
                    raise Exception(f'Could not match {tds} to regex search pattern')
                name = match.group(1)
                number = match.group(2)
                # NOTE(review): number is a *string* here although HtmlImport
                # annotates the mapping as dict[int, ...] — kept as-is so
                # existing callers keep seeing the same key type.
                participants[number] = HtmlParticipant(name, place, finalist)

            for row in rows:
                __parseRow(row)

        def __parseFirstTable(table):
            # The first table must be the final round ("Endrunde").
            roundName = table.tr.td.contents[0]
            if roundName != 'Endrunde':
                raise Exception('Could not parse HTML file')

            # Skip the two header rows; these rows are the finalists.
            __parseRows(table.find_all('tr')[2:], True)

        def __parseRemainingTables(tables):
            # Every further table holds non-finalists.
            for table in tables:
                __parseRows(table.find_all('tr')[2:], False)

        tables = soup.find('div', class_='extract').find_all('table')
        if tables:
            __parseFirstTable(tables[0])
            __parseRemainingTables(tables[1:])

        title = soup.find('div', class_='eventhead').table.tr.td.contents[0]

        return HtmlImport(title, participants)

    def guessDataFromHtmlTitle(self, title):
        """Derive dance, class and group from the page title.

        :param title: Title like '... ETW, Solos <group> <class> <dance>'.
        :returns: dict with keys 'dance', 'class_' and 'group'; class and
            group are abbreviated when known, otherwise passed through.
        :raises Exception: If the title lacks the 'ETW, Solos' marker.
        """
        match = self._TITLE_RE.match(title)
        if match is None:
            raise Exception(f'Cannot parse title "{title}"')

        rest = match.group(1)
        # The dance name may itself contain spaces, so split at most twice.
        rawGroup, rawClass, dance = rest.split(' ', 2)

        return {
            'dance': dance.strip(),
            'class_': self._CLASS_MAP.get(rawClass, rawClass),
            'group': self._GROUP_MAP.get(rawGroup, rawGroup),
        }