minyma/minyma/normalizer.py
2023-10-15 22:55:45 -04:00

46 lines
1.3 KiB
Python

from io import TextIOWrapper
import json
class DataNormalizer:
    """
    Placeholder base class for normalizers.

    Subclasses are expected to override both methods; the versions here
    intentionally do nothing.
    """

    def __init__(self, file: TextIOWrapper):
        """Accept the input file; the base class does not store it."""

    def __iter__(self):
        """Do nothing in the base class (returns None, not an iterator)."""
        return None
class PubMedNormalizer(DataNormalizer):
    """
    Iterator class that takes a file and iterates over each line. Data is
    normalized inside the iterator.

    Every non-blank line is parsed as a JSON object. Its "text" value is
    lower-cased, everything up to and including the second occurrence of
    "text mining" is discarded, and so is the remainder of the line that
    occurrence sits on.
    """

    # Marker whose second occurrence ends the preamble that gets discarded.
    _MARKER = "text mining"

    def __init__(self, file: TextIOWrapper):
        """
        Args:
            file: Open text file holding one JSON object per line.
        """
        super().__init__(file)
        self.file = file

    def __iter__(self):
        """
        Yield one normalized document per non-blank line of ``self.file``.

        Yields:
            dict: ``{"doc": <normalized text>, "id": <line number>}`` where
            the id is the zero-based line number as a string. "doc" is an
            empty string when "text" is missing/null or contains fewer than
            two occurrences of the marker.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        # ID = Line Number. Blank lines are skipped but still consume a
        # number, so an id always points back at its line in the file.
        for line_no, line in enumerate(self.file):
            # A trailing newline at EOF (or any whitespace-only line) is not
            # a record; feeding it to json.loads would raise.
            if not line.strip():
                continue

            # Load JSON. strict=False allows control characters (e.g. raw
            # newlines) inside JSON strings.
            record = json.loads(line, strict=False)

            # A missing or null "text" is treated as empty text instead of
            # failing on None.lower().
            text = (record.get("text") or "").lower()

            # Using the second occurrence of "text mining" as a break point.
            # We only capture what follows. Splitting at most twice leaves
            # the tail untouched, so nothing has to be re-joined. Initially
            # tried using regular expressions, but plain string splitting is
            # significantly faster.
            parts = text.split(self._MARKER, 2)
            tail = parts[2] if len(parts) == 3 else ""

            # Drop the rest of the line the marker was found on.
            doc = tail.partition("\n")[2]

            yield {"doc": doc, "id": str(line_no)}