solo-auswertung/src/solo_turnier/html_parser.py

from bs4 import BeautifulSoup
import bs4

import logging
import re

class HtmlParser:
    """Extract participants from the result tables of an HTML page.

    The page is expected to contain a ``<div class="extract">`` holding one
    or more ``<table>`` elements. The first table must be headed
    ``Endrunde``; participants listed in it are flagged as finalists, and
    participants of every following table are flagged as non-finalists.
    """

    # Participant cells look like "<name> (<number>)". Compiled once for the
    # class instead of once per table row.
    _PARTICIPANT_PATTERN = re.compile(r'(.*) \(([0-9]+)\)')

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.html_parser')

    def parseString(self, text: str):
        """Parse *text* as HTML and return the participants found in it.

        Args:
            text: The HTML document as a string.

        Returns:
            A dict mapping the participant number (as a string of digits) to
            a dict with the keys ``'name'``, ``'place'`` and ``'finalist'``.
            If the same number occurs more than once, the last occurrence
            wins. An empty dict is returned when the container holds no
            tables.

        Raises:
            Exception: If the ``extract`` container is missing, the first
                table is not headed ``Endrunde``, or a row does not follow
                the ``<place> | <name> (<number>)`` layout.
        """
        soup = BeautifulSoup(text, 'html.parser')

        # find() returns None when nothing matches; fail with a clear message
        # instead of an AttributeError on the chained find_all().
        container = soup.find('div', class_='extract')
        if container is None:
            raise Exception('Could not parse HTML file: no <div class="extract"> found')

        ret = {}
        tables = container.find_all('table')
        if not tables:
            return ret

        self._checkFinalTable(tables[0])
        for index, table in enumerate(tables):
            # The first two rows of every table are skipped (not participant rows).
            self._parseRows(table.find_all('tr')[2:], index == 0, ret)

        return ret

    def _checkFinalTable(self, table):
        """Raise unless the first cell of the first row of *table* is 'Endrunde'."""
        firstRow = table.tr
        firstCell = firstRow.td if firstRow is not None else None
        if firstCell is None or not firstCell.contents:
            raise Exception('Could not parse HTML file')
        if firstCell.contents[0] != 'Endrunde':
            raise Exception('Could not parse HTML file')

    def _parseRows(self, rows, finalist: bool, ret: dict):
        """Add one participant per row in *rows* to *ret*, keyed by number."""
        for row in rows:
            tds = row.find_all('td')
            if len(tds) < 2 or not tds[0].contents or not tds[1].contents:
                raise Exception(f'Could not match {tds} to regex search pattern')

            place = tds[0].contents[0]
            if isinstance(place, str):
                # A NavigableString keeps a reference to the whole parse
                # tree; store a plain str (which compares equal) instead.
                place = str(place)

            label = tds[1].contents[0]
            # Only text nodes can be matched; a nested tag counts as a mismatch.
            match = None
            if isinstance(label, str):
                match = self._PARTICIPANT_PATTERN.fullmatch(label)
            if match is None:
                raise Exception(f'Could not match {tds} to regex search pattern')

            ret[match.group(2)] = {
                'name': match.group(1),
                'place': place,
                'finalist': finalist,
            }
Create tested HTML parser for result tables 2022-11-13 16:01:44 +00:00			`from bs4 import BeautifulSoup`
			`import bs4`

			`import logging`
			`import re`

			`class HtmlParser:`

			`def __init__(self):`
			`self.l = logging.getLogger('solo_turnier.html_parser')`

			`def parseString(self, text: str):`
			`soup = BeautifulSoup(text, 'html.parser')`

			`ret = {}`

			`def __parseRows(rows, finalist: bool):`
			`def __parseRow(row):`
			`tds = row.find_all('td')`
			`regex = re.compile('(.*) \\(([0-9]+)\\)')`

			`place = tds[0].contents[0]`

			`match = regex.fullmatch(tds[1].contents[0])`
			`if match is None:`
			`raise Exception(f'Could not match {tds} to regex search pattern')`
			`name = match.group(1)`
			`number = match.group(2)`

			`participant = {`
			`'name': name,`
			`'place': place,`
			`'finalist': finalist`
			`}`
			`ret[number] = participant`

			`for row in rows:`
			`__parseRow(row)`

			`def __parseFirstTable(table):`
			`roundName = table.tr.td.contents[0]`
			`if roundName != 'Endrunde':`
			`raise Exception('Could not parse HTML file')`

			`__parseRows(table.find_all('tr')[2:], True)`

			`def __parseRemainingTables(tables):`
			`for table in tables:`
			`__parseRows(table.find_all('tr')[2:], False)`

			`tables = soup.find('div', class_='extract').find_all('table')`
			`if len(tables) > 0:`
			`__parseFirstTable(tables[0])`

			`__parseRemainingTables(tables[1:])`

			`return ret`