Parse preview rounds in batch script

This commit is contained in:
Christian Wolf 2022-11-27 09:10:17 +01:00
parent ffa1e36b6f
commit 9e59ca79d2
4 changed files with 131 additions and 34 deletions

View File

@ -177,46 +177,60 @@ class BatchWorker:
self.l.debug(self.config.__dict__) self.l.debug(self.config.__dict__)
locator = solo_turnier.html_locator.HtmlLocator() locator = solo_turnier.html_locator.HtmlLocator()
self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath()) self.l.info('Checking for feasible preview HTML export files in "%s"', self.config.importHtmlPath())
htmlCandidates = locator.findCandidates(self.config.importHtmlPath()) htmlCandidatesPreview = locator.findPreviewRoundCandidates(self.config.importHtmlPath())
self.l.debug('Found HTML file candidates: %s', htmlCandidates) self.l.debug('Found HTML file candidates for preview rounds: %s', htmlCandidatesPreview)
htmlParser = solo_turnier.html_parser.HtmlParser() previewWorker = solo_turnier.worker.PreviewWorker()
htmlImports = [] self.l.info('Filtering for pure preview rounds.')
self.l.info('Importing the candidates') parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview)
for candidate in htmlCandidates: self.l.debug('Remaining files: %s', parsers.keys())
self.l.debug('Processing file %s', candidate)
with open(candidate, 'r') as fp: self.l.info('Extracting person data from the preview rounds.')
fileContent = fp.read() previewWorker.extractPersonsFromPreview(parsers)
htmlImports.append(htmlParser.parseString(fileContent))
self.l.info('All HTML files have been parsed')
csvReader = solo_turnier.reader.AllResultReader(self.config.importCSVPath()) csvReader = solo_turnier.reader.AllResultReader(self.config.importCSVPath())
self.l.info('Importing the total result CSV file %s', self.config.importCSVPath()) self.l.info('Importing the total result CSV file %s', self.config.importCSVPath())
csvData = csvReader.readFile() csvData = csvReader.readFile()
self.l.info('CSV file has been read') self.l.info('CSV file has been read')
self.l.info('Processing the imported data')
csvExtractor = solo_turnier.worker.CSVExtractor()
self.l.debug('Importing CSV data into internal structures')
csvRows = csvExtractor.mapCSVImport(csvData)
worker = solo_turnier.worker.DataWorker() worker = solo_turnier.worker.DataWorker()
self.l.debug('Combining results from CSV for individual users')
data = worker.combineRowsByPerson(csvRows)
self.l.debug('Fix the groups for combined competitions')
unambiguous, fixedGroups = worker.consolidateGroups(data)
if fixedGroups:
self.l.info('It was required to fix some group issues.')
if not unambiguous:
self.warning('There were couples whose group could not be extracted unambiguously.')
self.l.debug('Merging HTML and CSV data')
worker.mergeHtmlData(data, htmlImports)
self.l.info('Data is prepared')
consoleOutputtter = solo_turnier.output.ConsoleOutputter() # self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath())
consoleOutputtter.output(data) # htmlCandidates = locator.findCandidates(self.config.importHtmlPath())
# self.l.debug('Found HTML file candidates: %s', htmlCandidates)
# htmlParser = solo_turnier.html_parser.HtmlParser()
# htmlImports = []
# self.l.info('Importing the candidates')
# for candidate in htmlCandidates:
# self.l.debug('Processing file %s', candidate)
# with open(candidate, 'r') as fp:
# fileContent = fp.read()
# htmlImports.append(htmlParser.parseString(fileContent))
# self.l.info('All HTML files have been parsed')
# self.l.info('Processing the imported data')
# csvExtractor = solo_turnier.worker.CSVExtractor()
# self.l.debug('Importing CSV data into internal structures')
# csvRows = csvExtractor.mapCSVImport(csvData)
# self.l.debug('Combining results from CSV for individual users')
# data = worker.combineRowsByPerson(csvRows)
# self.l.debug('Fix the groups for combined competitions')
# unambiguous, fixedGroups = worker.consolidateGroups(data)
# if fixedGroups:
# self.l.info('It was required to fix some group issues.')
# if not unambiguous:
# self.warning('There were couples whose group could not be extracted unambiguously.')
# self.l.debug('Merging HTML and CSV data')
# worker.mergeHtmlData(data, htmlImports)
# self.l.info('Data is prepared')
# consoleOutputtter = solo_turnier.output.ConsoleOutputter()
# consoleOutputtter.output(data)
def run1(self): def run1(self):
allResults, finals = self.__extractDataFromFiles() allResults, finals = self.__extractDataFromFiles()

View File

@ -170,7 +170,7 @@ class HtmlParser:
def cleanPreparationRoundImport(self, data): def cleanPreparationRoundImport(self, data):
def __cleanTable(table): def __cleanTable(table):
def __cleanText(s: str): def __cleanText(s: str):
print("cleaning string ", s) # print("cleaning string ", s)
return s.strip(' \n\xa0') return s.strip(' \n\xa0')
def __cleanEntry(entry): def __cleanEntry(entry):
@ -180,7 +180,7 @@ class HtmlParser:
for row in table: for row in table:
for entry in row: for entry in row:
print(entry) # print(entry)
__cleanEntry(entry) __cleanEntry(entry)
data['title'] = data['title'].strip() data['title'] = data['title'].strip()

View File

@ -4,6 +4,7 @@ import csv
import os import os
import logging import logging
import re import re
from pprint import pformat
class AllResultReader: class AllResultReader:
def __init__(self, fileName: str): def __init__(self, fileName: str):
@ -25,5 +26,6 @@ class AllResultReader:
'data': rows[1:] 'data': rows[1:]
} }
logging.getLogger('solo_turnier.reader.all_results').debug('Imported results from allresults.csv file: %s', ret) l = logging.getLogger('solo_turnier.reader.all_results')
l.log(5, 'Imported results from allresults.csv file: %s', (ret))
return ret return ret

View File

@ -20,6 +20,15 @@ class ResultRow:
def __str__(self): def __str__(self):
return f'{self.name} ({self.id}, {self.club}) is in {self.group} {self.class_} and danced the {self.dance} in {self.competitionGroup} {self.competitionClass} getting place {self.place}-{self.placeTo}' return f'{self.name} ({self.id}, {self.club}) is in {self.group} {self.class_} and danced the {self.dance} in {self.competitionGroup} {self.competitionClass} getting place {self.place}-{self.placeTo}'
class HtmlPerson:
    """Plain record for a person read from an HTML export.

    Stores the three constructor arguments unchanged as the attributes
    ``name``, ``id`` and ``group``.
    """

    def __init__(self, name, id, group):
        # ``id`` shadows the builtin, but the parameter name is part of the
        # constructor's signature and is therefore kept as it is.
        self.name, self.id, self.group = name, id, group

    def __repr__(self):
        """Render the person as ``name (id, group)``."""
        return '{} ({}, {})'.format(self.name, self.id, self.group)
class ResultPerson: class ResultPerson:
def __init__(self, firstName, lastName, club, id = None, group = None): def __init__(self, firstName, lastName, club, id = None, group = None):
self.firstName = firstName self.firstName = firstName
@ -148,6 +157,76 @@ class CSVExtractor:
return ret return ret
class PreviewWorker:
    """Select the preview round HTML exports and read the persons listed in them."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.PreviewWorker')

    def filterFilesPreview(self, files: list[str]) -> dict[str, html_parser.HtmlParser]:
        """Keep only the files whose HTML title marks them as a preview round.

        Every file is read and wrapped in an ``HtmlParser``. A file is kept
        when ``guessDataFromHtmlTitle()`` reports the class ``'Sichtung'``.
        Files whose title cannot be interpreted are logged as errors and
        skipped, so one broken export does not abort the whole run.

        Args:
            files: Paths of the candidate HTML files.

        Returns:
            A dict mapping each accepted file path to its parser.
        """
        self.l.debug('Filtering the list of parsers by removing all non preview entries.')
        ret = {}

        for file in files:
            with open(file, 'r') as fp:
                text = fp.read()

            parser = html_parser.HtmlParser(text)

            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                # Only ordinary errors are treated as "unparsable file";
                # KeyboardInterrupt/SystemExit must still stop the program.
                self.l.error('Unable to parse html file in %s. Please check manually.', file)
                self.l.debug('Details of the parsing problem:', exc_info=True)
                continue

            if data['class_'] == 'Sichtung':
                self.l.debug('Found candidate in %s. Adding to the list.', file)
                ret[file] = parser
            else:
                self.l.debug('Rejecting file %s as the name %s did not match.', file, data['class_'])

        return ret

    def __extractPersonsFromSinglePreview(self, parser: html_parser.HtmlParser):
        """Build the list of ``HtmlPerson`` entries found in one preview export.

        The first table row supplies the persons: a cell's ``'text'`` is used
        as the id and its ``'meta'`` as the name; cells with empty text are
        ignored. The group is read from the last table row when the last
        title is ``'Startgruppe'``, otherwise the group guessed from the
        page title is used for everybody.

        Raises:
            Exception: If the titles or the table are empty or the first
                title is not ``'Wertungsrichter'``.
        """
        imported = parser.parsePreparationRound()
        # The return value is not used; the cleaned values are read from
        # ``imported`` afterwards.
        parser.cleanPreparationRoundImport(imported)
        data = imported['data']

        # Level 5 lies below logging.DEBUG (10): very verbose dumps only.
        self.l.log(5, data)

        titles = data['titles']
        table = data['table']
        # Empty titles/table are reported as an incompatible file as well,
        # instead of surfacing as a bare IndexError.
        if not titles or not table or titles[0] != 'Wertungsrichter':
            self.l.critical('Cannot parse the parsed content of the preview file.')
            raise Exception('Incompatible export file')

        # Remember the column index of every used cell so the matching group
        # cell can be looked up in the last row.
        entries = [(index, e) for index, e in enumerate(table[0]) if e['text'] != '']

        if titles[-1] == 'Startgruppe':
            self.l.debug('Combined competition found. Extracting group from table')
            groups = [table[-1][index]['text'] for index, _ in entries]
        else:
            self.l.debug('Using group from the title.')
            group = parser.guessDataFromHtmlTitle(imported['title'])['group']
            groups = [group] * len(entries)

        ret = [
            HtmlPerson(e['meta'], e['text'], group)
            for (_, e), group in zip(entries, groups)
        ]

        self.l.log(5, ret)
        return ret

    def extractPersonsFromPreview(self, parsers):
        """Extract the persons of every given preview file.

        Args:
            parsers: Dict mapping file path to parser, as returned by
                ``filterFilesPreview``.

        Returns:
            A dict mapping each file path to the list of ``HtmlPerson``
            objects extracted from that file. Earlier versions returned
            nothing and dropped the extracted data; callers that ignore the
            return value are unaffected.
        """
        persons = {}
        for file, parser in parsers.items():
            self.l.debug('Extracting person data from %s', file)
            persons[file] = self.__extractPersonsFromSinglePreview(parser)
        return persons
class DataWorker: class DataWorker:
def __init__(self): def __init__(self):
self.l = logging.getLogger('solo_turnier.worker') self.l = logging.getLogger('solo_turnier.worker')
@ -297,3 +376,5 @@ class DataWorker:
ret.append(competitions[0]) ret.append(competitions[0])
return ret return ret