Spaces:
Paused
Paused
Commit
·
9901139
1
Parent(s):
fa32459
normalized diseases
Browse files
utils.py
CHANGED
|
@@ -4,6 +4,7 @@ import pandas as pd
|
|
| 4 |
import ssl
|
| 5 |
import torch
|
| 6 |
import re
|
|
|
|
| 7 |
from pprint import pprint
|
| 8 |
from captum.attr import visualization
|
| 9 |
|
|
@@ -21,6 +22,39 @@ class PyTMinMaxScalerVectorized(object):
|
|
| 21 |
tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
|
| 22 |
return tensor
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def get_diseases(text, pipe):
|
| 25 |
results = pipe(text)
|
| 26 |
diseases = []
|
|
@@ -44,7 +78,8 @@ def get_diseases(text, pipe):
|
|
| 44 |
if len(disease_span) > 1:
|
| 45 |
disease = text[disease_span[0]: disease_span[1]]
|
| 46 |
diseases.append(disease)
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
def find_end(text):
|
| 50 |
"""Find the end of the report."""
|
|
|
|
| 4 |
import ssl
|
| 5 |
import torch
|
| 6 |
import re
|
| 7 |
+
import difflib
|
| 8 |
from pprint import pprint
|
| 9 |
from captum.attr import visualization
|
| 10 |
|
|
|
|
| 22 |
tensor.mul_(scale).sub_(tensor.min(dim=0, keepdim=True)[0])
|
| 23 |
return tensor
|
| 24 |
|
| 25 |
+
def _normalized_diseases(text_list, disease):
|
| 26 |
+
candidates = difflib.get_close_matches(disease, text_list)
|
| 27 |
+
if len(candidates) > 0:
|
| 28 |
+
return candidates[0]
|
| 29 |
+
return ''
|
| 30 |
+
|
| 31 |
+
def clean_disease_string(disease):
    """Return *disease* lower-cased, without punctuation or stray whitespace.

    Every character that is neither a word character nor whitespace is
    removed. Whitespace is normalized *after* that removal, so punctuation at
    the edges or between words cannot leave leading, trailing, or doubled
    spaces behind.

    Args:
        disease: Raw disease string.

    Returns:
        The cleaned string; ``''`` if nothing but punctuation/whitespace
        was present.
    """
    disease = re.sub(r'[^\w\s]', '', disease.lower())
    # split()/join trims both ends and collapses internal whitespace runs.
    return ' '.join(disease.split())
|
| 35 |
+
|
| 36 |
+
def normalized_diseases(text, disease_list):
    """Map each disease mention to a cleaned form built from words of *text*.

    Each disease is split on whitespace and every word is matched against the
    whitespace-separated tokens of *text* via ``_normalized_diseases``. A
    disease is kept only if all of its words find a match; the matched tokens
    are joined with single spaces and passed through ``clean_disease_string``.

    Args:
        text: The text the diseases were extracted from.
        disease_list: Iterable of disease strings; duplicates are allowed.

    Returns:
        A list of unique normalized disease strings, in first-seen order so
        that the result is reproducible between runs.
    """
    text_split = text.split()
    # A dict serves as an insertion-ordered set (unlike set(), whose
    # iteration order for strings varies from one process to the next).
    normalized = {}
    for disease in dict.fromkeys(disease_list):
        words = disease.split()
        if not words:
            continue
        matches = []
        for disease_word in words:
            candidate = _normalized_diseases(
                text_list=text_split, disease=disease_word
            )
            if not candidate:
                # One unmatched word disqualifies the whole concept.
                break
            matches.append(candidate)
        if len(matches) != len(words):
            continue
        concept = clean_disease_string(' '.join(matches))
        # Cleaning may strip everything (punctuation-only match); skip those.
        if concept:
            normalized[concept] = None
    return list(normalized)
|
| 57 |
+
|
| 58 |
def get_diseases(text, pipe):
|
| 59 |
results = pipe(text)
|
| 60 |
diseases = []
|
|
|
|
| 78 |
if len(disease_span) > 1:
|
| 79 |
disease = text[disease_span[0]: disease_span[1]]
|
| 80 |
diseases.append(disease)
|
| 81 |
+
normalized = normalized_diseases(text, diseases)
|
| 82 |
+
return normalized
|
| 83 |
|
| 84 |
def find_end(text):
|
| 85 |
"""Find the end of the report."""
|