From 9e59ca79d2f4ab991a0480f78e54ceabf0df19ab Mon Sep 17 00:00:00 2001 From: Christian Wolf Date: Sun, 27 Nov 2022 09:10:17 +0100 Subject: [PATCH] Parse preview rounds in batch script --- src/solo_turnier/batch.py | 76 ++++++++++++++++++------------- src/solo_turnier/html_parser.py | 4 +- src/solo_turnier/reader.py | 4 +- src/solo_turnier/worker.py | 81 +++++++++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 34 deletions(-) diff --git a/src/solo_turnier/batch.py b/src/solo_turnier/batch.py index 460544d..57408ee 100644 --- a/src/solo_turnier/batch.py +++ b/src/solo_turnier/batch.py @@ -177,46 +177,60 @@ class BatchWorker: self.l.debug(self.config.__dict__) locator = solo_turnier.html_locator.HtmlLocator() - self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath()) - htmlCandidates = locator.findCandidates(self.config.importHtmlPath()) - self.l.debug('Found HTML file candidates: %s', htmlCandidates) + self.l.info('Checking for feasible preview HTML export files in "%s"', self.config.importHtmlPath()) + htmlCandidatesPreview = locator.findPreviewRoundCandidates(self.config.importHtmlPath()) + self.l.debug('Found HTML file candidates for preview rounds: %s', htmlCandidatesPreview) - htmlParser = solo_turnier.html_parser.HtmlParser() - htmlImports = [] - self.l.info('Importing the candidates') - for candidate in htmlCandidates: - self.l.debug('Processing file %s', candidate) - with open(candidate, 'r') as fp: - fileContent = fp.read() - htmlImports.append(htmlParser.parseString(fileContent)) - self.l.info('All HTML files have been parsed') + previewWorker = solo_turnier.worker.PreviewWorker() + self.l.info('Filtering for pure preview rounds.') + parsers = previewWorker.filterFilesPreview(htmlCandidatesPreview) + self.l.debug('Remaining files: %s', parsers.keys()) + + self.l.info('Extracting person data from the preview rounds.') + previewWorker.extractPersonsFromPreview(parsers) csvReader = 
solo_turnier.reader.AllResultReader(self.config.importCSVPath()) self.l.info('Importing the total result CSV file %s', self.config.importCSVPath()) csvData = csvReader.readFile() self.l.info('CSV file has been read') - - self.l.info('Processing the imported data') - csvExtractor = solo_turnier.worker.CSVExtractor() - self.l.debug('Importing CSV data into internal structures') - csvRows = csvExtractor.mapCSVImport(csvData) - worker = solo_turnier.worker.DataWorker() - self.l.debug('Combining results from CSV for individual users') - data = worker.combineRowsByPerson(csvRows) - self.l.debug('Fix the groups for combined competitions') - unambiguous, fixedGroups = worker.consolidateGroups(data) - if fixedGroups: - self.l.info('It was required to fix some group issues.') - if not unambiguous: - self.warning('There were couples whose group could not be extracted unambiguously.') - self.l.debug('Merging HTML and CSV data') - worker.mergeHtmlData(data, htmlImports) - self.l.info('Data is prepared') - consoleOutputtter = solo_turnier.output.ConsoleOutputter() - consoleOutputtter.output(data) + # self.l.info('Checking for feasible HTML export files in "%s"', self.config.importHtmlPath()) + # htmlCandidates = locator.findCandidates(self.config.importHtmlPath()) + # self.l.debug('Found HTML file candidates: %s', htmlCandidates) + + # htmlParser = solo_turnier.html_parser.HtmlParser() + # htmlImports = [] + # self.l.info('Importing the candidates') + # for candidate in htmlCandidates: + # self.l.debug('Processing file %s', candidate) + # with open(candidate, 'r') as fp: + # fileContent = fp.read() + # htmlImports.append(htmlParser.parseString(fileContent)) + # self.l.info('All HTML files have been parsed') + + + # self.l.info('Processing the imported data') + + # csvExtractor = solo_turnier.worker.CSVExtractor() + # self.l.debug('Importing CSV data into internal structures') + # csvRows = csvExtractor.mapCSVImport(csvData) + + # self.l.debug('Combining results from CSV for 
individual users') + # data = worker.combineRowsByPerson(csvRows) + # self.l.debug('Fix the groups for combined competitions') + # unambiguous, fixedGroups = worker.consolidateGroups(data) + # if fixedGroups: + # self.l.info('It was required to fix some group issues.') + # if not unambiguous: + # self.warning('There were couples whose group could not be extracted unambiguously.') + # self.l.debug('Merging HTML and CSV data') + # worker.mergeHtmlData(data, htmlImports) + # self.l.info('Data is prepared') + + # consoleOutputtter = solo_turnier.output.ConsoleOutputter() + # consoleOutputtter.output(data) def run1(self): allResults, finals = self.__extractDataFromFiles() diff --git a/src/solo_turnier/html_parser.py b/src/solo_turnier/html_parser.py index 04eb8ad..2fd0d7d 100644 --- a/src/solo_turnier/html_parser.py +++ b/src/solo_turnier/html_parser.py @@ -170,7 +170,7 @@ class HtmlParser: def cleanPreparationRoundImport(self, data): def __cleanTable(table): def __cleanText(s: str): - print("cleaning string ", s) + # print("cleaning string ", s) return s.strip(' \n\xa0') def __cleanEntry(entry): @@ -180,7 +180,7 @@ class HtmlParser: for row in table: for entry in row: - print(entry) + # print(entry) __cleanEntry(entry) data['title'] = data['title'].strip() diff --git a/src/solo_turnier/reader.py b/src/solo_turnier/reader.py index dac7720..ca246d2 100644 --- a/src/solo_turnier/reader.py +++ b/src/solo_turnier/reader.py @@ -4,6 +4,7 @@ import csv import os import logging import re +from pprint import pformat class AllResultReader: def __init__(self, fileName: str): @@ -25,5 +26,6 @@ class AllResultReader: 'data': rows[1:] } - logging.getLogger('solo_turnier.reader.all_results').debug('Imported results from allresults.csv file: %s', ret) + l = logging.getLogger('solo_turnier.reader.all_results') + l.log(5, 'Imported results from allresults.csv file: %s', (ret)) return ret diff --git a/src/solo_turnier/worker.py b/src/solo_turnier/worker.py index ab7e9e0..c1e9db7 100644 
# Definitions this patch adds to src/solo_turnier/worker.py. The neighbouring
# ResultRow, ResultPerson, CSVExtractor and DataWorker classes appear only as
# partial diff context in the patch and are therefore not reproduced here.


class HtmlPerson:
    """Participant entry taken from an HTML preview-round export.

    Attributes are stored exactly as passed in: ``name``, ``id`` and ``group``.
    """

    def __init__(self, name, id, group):
        self.name = name
        self.id = id
        self.group = group

    def __repr__(self):
        return f'{self.name} ({self.id}, {self.group})'


class PreviewWorker:
    """Select preview-round HTML files and extract the persons listed in them."""

    def __init__(self):
        self.l = logging.getLogger('solo_turnier.worker.PreviewWorker')

    def filterFilesPreview(self, files: list[str]) -> 'dict[str, html_parser.HtmlParser]':
        """Return a parser for every file whose title data has class 'Sichtung'.

        Each file is read, wrapped in an ``html_parser.HtmlParser`` and asked
        for the data guessed from its HTML title. Files whose title cannot be
        parsed are logged as errors and skipped; files with a different class
        are rejected with a debug message.

        Returns:
            Mapping of file name to the parser built from that file.
        """
        self.l.debug('Filtering the list of parsers by removing all non preview entries.')
        ret = {}
        for file in files:
            # NOTE(review): relies on the platform default encoding -- confirm
            # which encoding the HTML exports actually use.
            with open(file, 'r') as fp:
                text = fp.read()

            parser = html_parser.HtmlParser(text)

            try:
                data = parser.guessDataFromHtmlTitle()
            except Exception:
                # A bare ``except:`` would also swallow KeyboardInterrupt and
                # SystemExit; keep the best-effort skip but record the cause.
                self.l.error(f'Unable to parse html file in {file}. Please check manually.', exc_info=True)
                continue

            if data['class_'] == 'Sichtung':
                self.l.debug(f"Found candidate in {file}. Adding to the list.")
                ret[file] = parser
            else:
                self.l.debug(f'Rejecting file {file} as the name {data["class_"]} did not match.')

        return ret

    def __extractPersonsFromSinglePreview(self, parser: 'html_parser.HtmlParser'):
        """Build the list of ``HtmlPerson`` entries of one preview-round file.

        The first table row supplies one person per non-empty cell: the cell's
        ``text`` is used as id and its ``meta`` as name. If the last title is
        'Startgruppe', the group of each person is read from the same column
        of the last table row; otherwise the group guessed from the page title
        is used for everybody.

        Raises:
            Exception: if the first title is not 'Wertungsrichter' (or there
                are no titles at all).
        """
        imported = parser.parsePreparationRound()
        parser.cleanPreparationRoundImport(imported)
        data = imported['data']

        self.l.log(5, data)

        titles = data['titles']
        # An empty title list is just as incompatible as a wrong first title;
        # report it with the same error instead of a stray IndexError.
        if not titles or titles[0] != 'Wertungsrichter':
            self.l.critical('Cannot parse the parsed content of the preview file.')
            raise Exception('Incompatible export file')

        header = data['table'][0]
        # Columns without text carry no participant and are skipped.
        indices = [index for index, e in enumerate(header) if e['text'] != '']

        if titles[-1] == 'Startgruppe':
            self.l.debug('Combined competition found. Extracting group from table')
            groupRow = data['table'][-1]
            groups = [groupRow[idx]['text'] for idx in indices]
        else:
            self.l.debug('Using group from the title.')
            group = parser.guessDataFromHtmlTitle(imported['title'])['group']
            groups = [group] * len(indices)

        ret = [
            HtmlPerson(header[idx]['meta'], header[idx]['text'], group)
            for idx, group in zip(indices, groups)
        ]

        self.l.log(5, ret)
        return ret

    def extractPersonsFromPreview(self, parsers):
        """Extract the persons of every preview file in ``parsers``.

        Args:
            parsers: Mapping of file name to parser, as returned by
                ``filterFilesPreview``.

        Returns:
            Mapping of file name to the list of ``HtmlPerson`` objects found
            in that file. Existing callers that ignore the return value keep
            working unchanged.
        """
        ret = {}
        for file, parser in parsers.items():
            self.l.debug('Extracting person data from %s', file)
            ret[file] = self.__extractPersonsFromSinglePreview(parser)
        return ret