parent
05285d29d4
commit
55e3efb3a8
24
.vscode/launch.json
vendored
Normal file
24
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,24 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python: Remote Attach",
|
||||
"type": "python",
|
||||
"request": "attach",
|
||||
"connect": {
|
||||
"host": "localhost",
|
||||
"port": 5678
|
||||
},
|
||||
"pathMappings": [
|
||||
// {
|
||||
// "localRoot": "${workspaceFolder}",
|
||||
// "remoteRoot": "."
|
||||
// }
|
||||
],
|
||||
"justMyCode": true
|
||||
}
|
||||
]
|
||||
}
|
1
scripts/read-competition-notification/.gitignore
vendored
Normal file
1
scripts/read-competition-notification/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
__pycache__/
|
@ -0,0 +1,50 @@
|
||||
from . import cli
|
||||
from . import mail
|
||||
from . import headerExtractor
|
||||
from . import mailParser
|
||||
from . import competitionParser
|
||||
from . import mboxReader
|
||||
|
||||
import logging
|
||||
import debugpy
|
||||
import os
|
||||
|
||||
def main():
    """Run the competition notification reader as a command line tool.

    Parses the command line, configures logging according to the requested
    verbosity, optionally waits for a debugger to attach and then converts
    every mail of the given mbox file into one generated file below the
    output folder.

    Raises:
        SystemExit: With exit code 1 when batch mode is requested without an
            output folder.
        NotImplementedError: When no mbox file was given (presumably the
            stdin mode mentioned by the ``--read-mbox`` help text - confirm).
    """
    args = cli.getArgs()

    logging.basicConfig()
    logger = logging.getLogger(__name__)

    # No -v -> WARNING, one -v -> INFO, anything beyond that -> DEBUG.
    verbosityMap = {
        0: logging.WARNING,
        1: logging.INFO,
    }
    rootLogger = logging.getLogger()
    rootLogger.setLevel(verbosityMap.get(args.verbose, logging.DEBUG))

    if args.debug:
        # Open the debug port and block until a debugger client has attached.
        debugpy.listen(5678)
        debugpy.wait_for_client()

    mp = mailParser.MailParser()
    cp = competitionParser.CompetitionParser()

    if args.read_mbox is None:
        # NotImplementedError is still an Exception, so existing handlers
        # keep working.
        raise NotImplementedError('Not yet implemented')

    if args.output_folder is None:
        logger.error('Cannot use batch mode without explicit output folder.')
        # Same effect as exit(1), but does not rely on the site module
        # having injected the exit() helper.
        raise SystemExit(1)

    reader = mboxReader.MBocReader()
    # Named rawMail (not mail) so the imported ``mail`` module is not shadowed.
    for rawMail in reader.parseMBoxFile(args.read_mbox[0]):
        body = mp.parseMail(rawMail)
        cp.parseMail(body)

        filename = cp.getFilename(args.output_folder[0])
        logger.info('Using file %s to generate the output.', filename)

        folder = os.path.dirname(filename)
        if folder:
            # dirname is empty for a bare file name; makedirs('') would fail.
            os.makedirs(folder, exist_ok=True)

        # Write UTF-8 explicitly: names may contain umlauts and the locale
        # default encoding is not UTF-8 everywhere.
        with open(filename, 'w', encoding='utf-8') as fp:
            fp.write(cp.getContent())
|
||||
|
@ -0,0 +1,3 @@
|
||||
# Entry script: import the reader package and hand control to its main().
import competitionNotificationReader as reader

reader.main()
|
@ -0,0 +1,11 @@
|
||||
import argparse
|
||||
|
||||
def getArgs(argv=None):
    """Parse the command line options of the notification reader.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read the process arguments, which is
            the behaviour callers without arguments get.

    Returns:
        The ``argparse.Namespace`` with the attributes ``read_mbox`` and
        ``output_folder`` (``None`` or a one-element list, because both use
        ``nargs=1``), ``verbose`` (number of ``-v`` flags, default 0) and
        ``debug`` (bool).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--read-mbox',
        nargs=1,
        help='Read mails from mbox file instead of stdin',
    )
    parser.add_argument(
        '-o', '--output-folder',
        nargs=1,
        help='Set the output folder of the generated files.',
    )
    parser.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='Increase the verbosity',
    )
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Enable python debugger',
    )

    return parser.parse_args(argv)
|
@ -0,0 +1,132 @@
|
||||
import bs4
|
||||
import logging
|
||||
import re
|
||||
import os
|
||||
import jinja2
|
||||
|
||||
class ParsingFailedEception(Exception):
    """Signals that a part of a mail could not be parsed.

    Accepts the same arguments as ``Exception``; no extra state is stored.
    """
|
||||
|
||||
class CompetitionParser:
    """Extract the competition data from the HTML body of a notification mail.

    ``parseMail`` fills the internal fields from the first ``<h2>`` and the
    first ``<table>`` of the body. ``getFilename`` and ``getContent`` then
    turn the parsed fields into an output path and the rendered template.
    The instance can be reused; every ``parseMail`` call overwrites the
    fields it parses.
    """

    def __init__(self):
        self._l = logging.getLogger(__name__)

        self._partner = ''
        self._partnerin = ''
        self._date = ''
        self._title = ''
        self._number = ''
        self._group = ''
        self._class = ''
        self._section = ''
        self._ort = ''
        self._verein = ''
        self._telefon = ''

        # Raw strings: '\.' in a normal string literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        self._reName = re.compile(r'Neue Meldung für (.*) / (.*)!')
        self._reDate = re.compile(r'([0-9]+)\.([0-9]+)\.([0-9]+)')
        self._reNumber = re.compile(r'Turnier: ([0-9]+)')
        self._rePhone = re.compile(r'Telefon: ([0-9 /]+)')
        self._rePlace = re.compile(r'Ort: (.*), (.*)')
        self._reCompetition = re.compile(r'(.*) ([A-ES]) ((?:Std)|(?:Lat)|(?:Kombi))')

        self._reCleaningString = re.compile(r'[^a-z0-9-]')
        self._reDashes = re.compile(r'-+')

    def parseMail(self, body: str):
        """Parse the HTML ``body`` and store the extracted fields.

        Raises:
            ParsingFailedEception: When the header or a table cell does not
                have the expected form.
        """
        parser = bs4.BeautifulSoup(body, 'html.parser')
        self._getNames(parser.h2)
        self._parseTable(parser.table)

    def _getNames(self, h2):
        """Read both partner names from the ``<h2>`` element."""
        # h2 is None when the body has no <h2>; .string is None when the
        # element does not contain exactly one text node.
        text = None if h2 is None else h2.string
        matcher = None if text is None else self._reName.match(text)
        if matcher is None:
            self._l.error('Parsing of header "%s" failed.', h2)
            raise ParsingFailedEception('Header could not be successfully parsed')
        self._partner = matcher.group(1)
        self._partnerin = matcher.group(2)

    def _parseTable(self, table):
        """Read date, title, number, competition, place and phone.

        The first six ``td`` cells of ``table`` are used, in that order.
        """
        if table is None:
            raise ParsingFailedEception('No table found in mail')

        tds = table('td')
        if len(tds) < 6:
            raise ParsingFailedEception(
                f'Expected at least 6 table cells in mail but found {len(tds)}'
            )

        def cellText(index):
            content = tds[index].string
            if content is None:
                raise ParsingFailedEception(f'Table cell {index} has no plain text content')
            return content.strip()

        self._parseDate(cellText(0))
        self._title = cellText(1)
        self._parseNumber(cellText(2))
        self._parseCompetition(cellText(3))
        self._parsePlace(cellText(4))
        self._parsePhone(cellText(5))

    def _parseDate(self, date):
        """Convert a dotted date (presumably day.month.year) to year-month-day."""
        match = self._reDate.fullmatch(date)
        if match is None:
            raise ParsingFailedEception('Cannot parse date %s in mail' % date)
        first, second, third = match.groups()
        # Zero-pad the two leading groups so that e.g. 1.2.2024 becomes
        # 2024-02-01 rather than 2024-2-1.
        self._date = f'{third}-{second.zfill(2)}-{first.zfill(2)}'

    def _parseNumber(self, content):
        """Store the competition number from a 'Turnier: <digits>' cell."""
        match = self._reNumber.fullmatch(content)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the turnier number in field {content}')
        self._number = match.group(1)

    def _parseCompetition(self, competition):
        """Split '<group> <class> <section>' into its three parts."""
        match = self._reCompetition.fullmatch(competition)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the competition line {competition}')
        self._group = match.group(1)
        self._class = match.group(2)
        self._section = match.group(3)

    def _parsePlace(self, place):
        """Split 'Ort: <verein>, <ort>' at the last ', '."""
        match = self._rePlace.fullmatch(place)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the place entry {place}')
        self._verein = match.group(1)
        self._ort = match.group(2)

    def _parsePhone(self, phone):
        """Store the number from a 'Telefon: <digits, spaces, slashes>' cell."""
        match = self._rePhone.fullmatch(phone)
        if match is None:
            raise ParsingFailedEception(f'Cannot parse the phone line {phone}')
        self._telefon = match.group(1)

    def _cleanName(self, name: str) -> str:
        """Return ``name`` as a lower-case slug of ``a-z``, ``0-9`` and ``-``.

        German umlauts and sharp s are transliterated, every other character
        becomes a dash and runs of dashes are collapsed into one.
        """
        cleanedName = name.lower()
        for umlaut, replacement in (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')):
            cleanedName = cleanedName.replace(umlaut, replacement)
        cleanedName = self._reCleaningString.sub('-', cleanedName)
        return self._reDashes.sub('-', cleanedName)

    def getFilename(self, prefix: str) -> str:
        """Build the output path ``<prefix>/<year>/<date>-<place>-<names>-<competition>.md``.

        ``<year>`` is the first four characters of the stored date.
        """
        namePartner = self._cleanName(self._partner)
        namePartnerin = self._cleanName(self._partnerin)
        competitionName = self._cleanName(f'{self._group} {self._class} {self._section}')
        # Clean the place like every other component: a raw place name may
        # contain spaces, umlauts or even a path separator.
        placeName = self._cleanName(self._ort)

        return os.path.join(
            prefix,
            self._date[0:4],
            f'{self._date}-{placeName}-{namePartner}-{namePartnerin}-{competitionName}.md',
        )

    def getContent(self) -> str:
        """Render ``contenttemplate.md.tmpl`` (next to this module) with the parsed fields."""
        templatePath = os.path.join(os.path.dirname(__file__), 'contenttemplate.md.tmpl')
        # Read the template as UTF-8 regardless of the locale default.
        with open(templatePath, encoding='utf-8') as fp:
            template = jinja2.Template(fp.read())

        context = {
            'date': self._date,
            'partner': self._partner,
            'partnerin': self._partnerin,
            'verein': self._verein,
            'ort': self._ort,
            'telefon': self._telefon,
            'group': self._group,
            'class': self._class,
            'section': self._section,
            'title': self._title,
            'number': self._number,
        }
        return template.render(**context)
|
@ -0,0 +1,13 @@
|
||||
---
|
||||
dateCompetition: {{ date }}
|
||||
partner: "{{ partner }}"
|
||||
partnerin: "{{ partnerin }}"
|
||||
verein: "{{ verein }}"
|
||||
ort: "{{ ort }}"
|
||||
telefon: "{{ telefon }}"
|
||||
gruppe: "{{ group }}"
|
||||
klasse: "{{ class }}"
|
||||
sektion: "{{ section }}"
|
||||
titel: "{{ title }}"
|
||||
nummer: {{ number }}
|
||||
---
|
@ -0,0 +1,30 @@
|
||||
import competitionNotificationReader as cnr
|
||||
import logging
|
||||
|
||||
# The return annotation is a string so that it is not evaluated while this
# module is imported; evaluating cnr.mail.Mail at that point only works if
# the package has already imported its ``mail`` submodule.
def splitHeaders(lines: list[str]) -> 'cnr.mail.Mail':
    """Split the raw lines of a mail (or mail part) into headers and body.

    Header lines are read up to the first empty line. Lines starting with a
    space or tab are treated as continuations and appended to the previous
    header value, separated by a single space. The body starts at the first
    non-blank line after the headers.

    Args:
        lines: The lines of the mail without line terminators.

    Returns:
        A ``Mail`` holding the ``(name, value)`` header tuples and the body
        lines. The body is empty when there is no blank separator line or
        only blank lines follow it.
    """
    logger = logging.getLogger(__name__)

    logger.debug('Separating headers of an email')

    def _getHeaders(lines: list[str]):
        headerLines = []
        for idx, line in enumerate(lines):
            if line == '':
                # End of the header section: skip blank lines before the body.
                remainingLines = lines[idx + 1:]
                for j, remainingLine in enumerate(remainingLines):
                    if remainingLine.strip() != '':
                        return headerLines, remainingLines[j:]
                return headerLines, []

            if line.startswith(('\t', ' ')):
                if not headerLines:
                    # A continuation with nothing to continue: ignore it
                    # instead of failing on the empty list.
                    logger.warning('Ignoring continuation line without preceding header.')
                    continue
                name, value = headerLines.pop()
                headerLines.append((name, f'{value} {line.strip()}'))
            else:
                # partition never fails: a line without ':' becomes a header
                # with an empty value instead of raising IndexError.
                name, _, value = line.partition(':')
                headerLines.append((name.strip(), value.strip()))

        # No blank separator line at all: everything was header, no body.
        return headerLines, []

    headerLines, bodyLines = _getHeaders(lines)

    return cnr.mail.Mail(headerLines, bodyLines)
|
@ -0,0 +1,11 @@
|
||||
import dataclasses
|
||||
|
||||
# Type aliases for a single mail header: a (name, value) pair of strings.
HeaderName_t = HeaderValue_t = str
HeaderEntry_t = tuple[HeaderName_t, HeaderValue_t]


@dataclasses.dataclass
class Mail:
    """A mail (or mail part) as a list of header entries plus body lines."""

    # (name, value) tuples, one per header
    headers: list[HeaderEntry_t]
    # body content, one string per line
    body: list[str]
|
||||
|
@ -0,0 +1,113 @@
|
||||
import competitionNotificationReader as cnr
|
||||
import logging
|
||||
import re
|
||||
|
||||
class MailParser:
    """Extract the decoded HTML body from a multipart/alternative mail.

    Annotations naming ``cnr.mail.Mail`` are written as strings so they are
    not evaluated while this module is being imported.
    """

    # boundary parameter of a Content-Type header, quoted or unquoted
    _RE_BOUNDARY = re.compile(r'boundary\s*=\s*(?:"([^"]+)"|([^\s;]+))', re.IGNORECASE)
    # a quoted-printable escape: '=' followed by exactly two hex digits
    _RE_QP_ESCAPE = re.compile(rb'=([0-9A-Fa-f]{2})')

    def __init__(self):
        self._l = logging.getLogger(__name__)

    def parseMail(self, rawMail: 'cnr.mail.Mail') -> str:
        """Return the decoded text of the mail's HTML part.

        The mail must be multipart/alternative and contain exactly one part
        that is ``text/html`` with ``quoted-printable`` transfer encoding.

        Raises:
            NotImplementedError: For other content types, or when not exactly
                one matching part is found.
            SystemExit: When the content type or boundary cannot be
                determined (see ``_getContentType``).
        """
        _contentType, boundary = self._getContentType(rawMail)
        subMails = self._splitMultipartBody(rawMail.body, boundary)

        candidates = [
            part for part in subMails
            if self._isHtmlPart(part) and self._isQuotedPrintablePart(part)
        ]
        if len(candidates) != 1:
            raise NotImplementedError('Not implemented')

        return self._mapQuotedrintable(candidates[0].body)

    @staticmethod
    def _firstHeaderValue(mail, name: str):
        """Return the value of the first header called ``name`` or ``None``.

        ``name`` must be lower case; header names are compared
        case-insensitively.
        """
        for headerName, headerValue in mail.headers:
            if headerName.lower() == name:
                return headerValue
        return None

    def _isHtmlPart(self, mail) -> bool:
        """Tell whether the part's first Content-Type header is text/html."""
        value = self._firstHeaderValue(mail, 'content-type')
        # Media types are case-insensitive, so compare in lower case.
        return value is not None and value.strip().lower().startswith('text/html')

    def _isQuotedPrintablePart(self, mail) -> bool:
        """Tell whether the part's transfer encoding is quoted-printable."""
        value = self._firstHeaderValue(mail, 'content-transfer-encoding')
        return value is not None and value.strip().lower() == 'quoted-printable'

    def _getContentType(self, rawMail: 'cnr.mail.Mail') -> tuple[str, str]:
        """Return ``('multipart/alternative', boundary)`` for ``rawMail``.

        Raises:
            SystemExit: With code 1 when there is not exactly one
                Content-Type header or it carries no boundary parameter.
            NotImplementedError: When the mail is not multipart/alternative.
        """
        ctHeaders = [h for h in rawMail.headers if h[0].lower() == 'content-type']
        if len(ctHeaders) != 1:
            self._l.error('No unique content type of the mail was found.')
            raise SystemExit(1)

        ct = ctHeaders[0][1]
        if not ct.lower().startswith('multipart/alternative'):
            raise NotImplementedError('Not yet implemented')

        # Search the original (not lower-cased) value: boundaries are
        # case-sensitive.
        matcher = self._RE_BOUNDARY.search(ct)
        if matcher is None:
            self._l.error('Cannot extract boundary from mail header.')
            raise SystemExit(1)

        boundary = matcher.group(1) or matcher.group(2)
        return 'multipart/alternative', boundary

    def _splitMultipartBody(self, bodyLines: list[str], boundary: str):
        """Split a multipart body into parts and parse each part's headers."""
        return [
            cnr.headerExtractor.splitHeaders(part)
            for part in self._collectParts(bodyLines, boundary)
        ]

    def _collectParts(self, bodyLines: list[str], boundary: str) -> list[list[str]]:
        """Return the lines of every non-empty part delimited by ``boundary``.

        Text before the first delimiter (preamble) and after the closing
        ``--boundary--`` line (epilogue) is dropped. A last part that is not
        terminated by any delimiter is still returned.
        """
        delimiter = f'--{boundary}'
        closing = f'{delimiter}--'

        parts = []
        current = None  # None while outside of any part
        for line in bodyLines:
            marker = line.rstrip()
            if marker == delimiter or marker == closing:
                if current:
                    parts.append(current)
                current = [] if marker == delimiter else None
            elif current is not None:
                current.append(line)

        if current:
            parts.append(current)
        return parts

    def _mapQuotedrintable(self, lines: list[str]) -> str:
        """Decode quoted-printable ``lines`` into a single string.

        Lines ending in ``=`` (soft line breaks) are merged with the line
        that follows. ``=XX`` escapes are turned into bytes and each merged
        line is decoded as UTF-8 (raises ``UnicodeDecodeError`` otherwise).
        An ``=`` that is not followed by two hex digits is kept literally.
        NOTE: the decoded lines are concatenated without a separator, so
        hard line breaks of the input do not appear in the result.
        """
        # Merge soft line breaks. Only the raw line decides whether it
        # continues; a dangling '=' on the very last line is simply dropped.
        mergedLines = []
        pending = []
        for line in lines:
            if line.endswith('='):
                pending.append(line[:-1])
            else:
                pending.append(line)
                mergedLines.append(''.join(pending))
                pending = []
        if pending:
            mergedLines.append(''.join(pending))

        def decodeLine(line: str) -> str:
            # Work on UTF-8 bytes so that an escaped multi-byte character
            # (e.g. =C3=BC) is reassembled before decoding. Substitution is
            # a single left-to-right pass, so '=3D41' yields '=41'.
            raw = self._RE_QP_ESCAPE.sub(
                lambda m: bytes([int(m.group(1), 16)]),
                line.encode(),
            )
            return raw.decode()

        return ''.join(decodeLine(line) for line in mergedLines)
|
||||
|
@ -0,0 +1,49 @@
|
||||
import logging
|
||||
import re
|
||||
import io
|
||||
|
||||
import competitionNotificationReader as cnr
|
||||
|
||||
class MBocReader:
    """Read an mbox file and split it into its individual mails.

    Annotations naming ``cnr.mail.Mail`` are written as strings so they are
    not evaluated while this module is being imported.
    """

    # a "From " line that was escaped with one or more leading '>'
    _RE_QUOTED_FROM = re.compile(r'>+From ')

    def __init__(self):
        self._l = logging.getLogger(__name__)

    def parseMBoxFile(self, filename: str) -> 'list[cnr.mail.Mail]':
        """Parse all mails contained in the mbox file ``filename``."""
        self._l.debug('Reading MBox file "%s"', filename)

        # Read as UTF-8 explicitly rather than with the locale default.
        with open(filename, encoding='utf-8') as fp:
            return self._parseMails(fp)

    def _isNewMailLine(self, line: str) -> bool:
        """Tell whether ``line`` is a "From " separator line starting a mail."""
        return line.startswith('From ')

    def _fixSingleLine(self, line: str) -> str:
        """Undo the '>' escaping of a line starting with '>From '.

        Exactly one leading '>' is removed from lines that consist of one or
        more '>' followed by 'From '; all other lines are returned unchanged.
        """
        if self._RE_QUOTED_FROM.match(line) is None:
            return line
        return line[1:]

    def _splitRawMails(self, fp: io.TextIOBase) -> list[list[str]]:
        """Group the lines read from ``fp`` by mail.

        The "From " separator lines themselves are dropped, the line
        terminator is removed from every line and escaped "From " lines are
        unescaped. Groups without any line are skipped.
        """
        rawMails = []
        current = []
        for line in fp:
            if self._isNewMailLine(line):
                if current:
                    rawMails.append(current)
                current = []
                continue

            # Only strip an actual newline: the last line of a file may not
            # end with one, and blindly cutting one character would
            # truncate its content.
            text = line[:-1] if line.endswith('\n') else line
            current.append(self._fixSingleLine(text))

        if current:
            rawMails.append(current)
        return rawMails

    def _parseMails(self, fp: io.TextIOBase) -> 'list[cnr.mail.Mail]':
        """Read all mails from the text stream ``fp`` and parse each one."""
        return [self._parseSingleMail(lines) for lines in self._splitRawMails(fp)]

    def _parseSingleMail(self, lines: list[str]) -> 'cnr.mail.Mail':
        """Turn the lines of one mail into a ``Mail`` with headers and body."""
        return cnr.headerExtractor.splitHeaders(lines)
|
5
scripts/read-competition-notification/requirements.txt
Normal file
5
scripts/read-competition-notification/requirements.txt
Normal file
@ -0,0 +1,5 @@
|
||||
beautifulsoup4==4.12.2
|
||||
debugpy==1.8.0
|
||||
Jinja2==3.1.3
|
||||
MarkupSafe==2.1.3
|
||||
soupsieve==2.5
|
Loading…
x
Reference in New Issue
Block a user