from io import TextIOWrapper
import json


class DataNormalizer:
    """Base interface for normalizers; subclasses provide the real logic."""

    def __init__(self, file: TextIOWrapper):
        self.file = file

    def __len__(self) -> int:
        return 0

    def __iter__(self):
        yield None


class PubMedNormalizer(DataNormalizer):
    """
    Iterator class that takes a file and iterates over each line. Data is
    normalized inside the iterator.
    """

    def __init__(self, file: TextIOWrapper):
        super().__init__(file)
        self.length = 0

    def __len__(self) -> int:
        # Count the lines remaining in the file, then restore the original
        # read position so iteration is unaffected.
        last_pos = self.file.tell()
        self.length = sum(1 for _ in self.file)
        self.file.seek(last_pos)
        return self.length

    def __iter__(self):
        count = 0

        # Iterate over each line in self.file, normalize it, increment the
        # counter, and yield the normalized record.
        while True:
            line = self.file.readline()

            # EOF
            if not line:
                break

            # Load JSON
            record = json.loads(line, strict=False)
            norm_text = record.get("text").lower()

            # Use the second occurrence of "text mining" as a break point and
            # keep only what follows it. Regular expressions were tried first,
            # but this plain split is significantly faster.
            split_data = norm_text.split("text mining")
            norm_text = "text mining".join(split_data[2:])
            norm_text = "\n".join(norm_text.split("\n")[1:])

            count += 1

            # The ID is the zero-based line number of the record.
            yield {
                "id": str(count - 1),
                "doc": norm_text,
                "metadata": {
                    "file": record.get("file")
                },
            }
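

# Usage sketch, not part of the original module: it assumes the input is a
# JSONL file in which every line is an object with "text" and "file" keys.
# The path "pubmed.jsonl" is a hypothetical example.
if __name__ == "__main__":
    with open("pubmed.jsonl", encoding="utf-8") as handle:
        normalizer = PubMedNormalizer(handle)
        print(f"{len(normalizer)} records to normalize")
        for doc in normalizer:
            print(doc["id"], doc["metadata"]["file"])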