-
Notifications
You must be signed in to change notification settings - Fork 1
/
app.py
298 lines (192 loc) · 14.5 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
from flask import Flask, request, render_template, redirect, url_for
from drug_named_entity_recognition import find_drugs
import json
import requests
from transformers import BertTokenizer
import nltk
nltk.download('punkt') # Download the sentence tokenizer
from nltk import sent_tokenize
import os

# Default headers sent with every text-generation request in this module.
#
# The bearer token is read from the IBM_CLOUD_BEARER_TOKEN environment
# variable rather than being embedded in the source: a token committed to the
# repository is a leaked credential, and the one previously hard-coded here
# carried a one-hour lifetime (its `iat`/`exp` claims), so it could not have
# kept working anyway. If the variable is unset the header is sent with an
# empty token and the API call is expected to be rejected.
header = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
    'Authorization': 'Bearer ' + os.environ.get('IBM_CLOUD_BEARER_TOKEN', ''),
}
### Best parameters found so far:
## portion length: 2000 tokens
## overlap: 250 tokens
## max new tokens per API call: 900
# Flask application object; the route decorators below register handlers on it.
# HTML templates are loaded from the 'templates' folder.
app = Flask(__name__,template_folder='templates')
def split_text_into_sentences(text):
    """Return the sentences of *text* as produced by NLTK's ``sent_tokenize``."""
    return sent_tokenize(text)
# Tokenizers are costly to construct, so each model's tokenizer is loaded once
# and reused by later calls instead of being rebuilt on every request.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name):
    """Return the cached ``BertTokenizer`` for *model_name*, loading it on first use."""
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


def split_text_with_equal_size(text, desired_portion_length=2000, overlap_tokens=250, model_name='bert-base-uncased'):
    """Split *text* into portions of whole sentences, each within a token budget.

    Sentences are accumulated greedily: a sentence is added to the current
    portion while the running token count stays at or below
    ``desired_portion_length``; otherwise the current portion is closed and
    the sentence starts a new one. Sentences are never cut, so a single
    sentence longer than the budget becomes its own oversized portion.

    The tokenizer is used only to *measure* sentences. Each portion is built
    by joining the original sentence strings with a single space, so casing,
    punctuation and spacing inside sentences are preserved. (Rebuilding the
    text from the tokenizer's tokens instead would pass it through the
    tokenizer's normalization before it is sent on to the generation API.)

    Args:
        text: The text to split.
        desired_portion_length: Maximum number of tokens per portion, counted
            with ``tokenizer.tokenize``.
        overlap_tokens: Accepted for backward compatibility; no overlap
            between portions is currently applied.
        model_name: Name passed to ``BertTokenizer.from_pretrained``.

    Returns:
        A list of portion strings. Very short input is returned unchanged as
        a single-element list.
    """
    tokenizer = _get_tokenizer(model_name)

    # Short-input fast path. The count is taken from the encoded input ids,
    # so it includes any special tokens the tokenizer adds.
    # NOTE(review): an earlier comment here said "900 tokens" while the code
    # checks 20; the code's threshold is kept -- confirm which was intended.
    if len(tokenizer(text)['input_ids']) <= 20:
        return [text]

    portions = []
    current_sentences = []
    current_length = 0
    for sentence in split_text_into_sentences(text):
        sentence_length = len(tokenizer.tokenize(sentence))
        # Close the running portion when this sentence would exceed the budget.
        # An empty running portion is never flushed, so an over-long first
        # sentence simply becomes a portion of its own.
        if current_sentences and current_length + sentence_length > desired_portion_length:
            portions.append(" ".join(current_sentences))
            current_sentences = []
            current_length = 0
        current_sentences.append(sentence)
        current_length += sentence_length

    if current_sentences:
        portions.append(" ".join(current_sentences))
    return portions
@app.route('/', methods=['GET', 'POST'])
def input_text():
    """Show the input form (GET) or hand the submitted text to ``generate_text`` (POST).

    On POST, the ``user_input`` form field is forwarded as a query parameter
    in a redirect to the ``generate_text`` endpoint. A missing or empty field
    renders ``error.html`` with an explanatory message.
    """
    if request.method != 'POST':
        return render_template('input.html')

    # .get() rather than ['user_input']: indexing a missing form field makes
    # Flask abort with an automatic 400 response, which would bypass the
    # error page this handler is written to show.
    user_input = request.form.get('user_input')
    if not user_input:
        return render_template('error.html', error_message='Input text is required.')
    return redirect(url_for('generate_text', user_input=user_input))
################## G generate_text T ################
@app.route('/generate_text', methods=['GET'])
def generate_text():
    """Run the extraction prompt over the submitted text and render the result.

    Reads ``user_input`` from the query string, splits it into portions with
    ``split_text_with_equal_size``, sends one generation request per portion,
    and renders ``result.html`` with the concatenated generated text and the
    original input. Any failure renders ``error.html``.
    """
    # Bound before the try block so the except handler can always read it,
    # even when the failure happens before any response has been received.
    response_data = None
    try:
        user_input = request.args.get('user_input')
        if not user_input:
            return render_template('error.html', error_message='Input text is required.')

        pieces = split_text_with_equal_size(user_input)
        url = "https://us-south.ml.cloud.ibm.com/ml/v1-beta/generation/text?version=2023-05-29"
        generated_texts = []
        for piece in pieces:
            payload = {
                "model_id": "meta-llama/llama-2-70b-chat",
                "input": "instruction : Extract medical information from the conversation specific to entities mentioned in the text. Categorize the information into demographics, Chief Complaint, HPI, PMHx, Social History, Family History, ROS, state and types of Physical Exam and Test, Medications and dosages, Plans and Other mentioned high-value information, and corresponding billing and icd codes. Only include explicitly mentioned information and notes when needed. refrain from inferring or adding details not present in the text. input text : " + piece,
                "parameters": {"decoding_method": "greedy", "max_new_tokens": 900, "min_new_tokens": 50, "stop_sequences": [], "repetition_penalty": 1},
                "project_id": "beaf6470-c5bc-4695-b204-29d09c8bf7fb",
                "moderations": {
                    "hap": {"input": True, "output": True, "threshold": 0.5, "mask": {"remove_entity_value": False}}
                },
            }
            # Reset so that, if this request fails, the error page does not
            # show the previous portion's (successful) response.
            response_data = None
            # A timeout keeps a stalled connection from hanging the request
            # handler forever; it is generous because generation can be slow.
            response = requests.post(url, headers=header, json=payload, timeout=300)
            response_data = response.json()
            generated_texts.append(response_data['results'][0]['generated_text'])

        combined_result = "".join(generated_texts)

        # NOTE: a disabled second pass used to sit here as commented-out code.
        # It re-sent combined_result to the same model with a "Categorize the
        # input_text ..." instruction (max_new_tokens 1500) and would pass that
        # response to the template instead of combined_result.
        return render_template('result.html', combined_result=combined_result , original_text=user_input)
    except Exception as e:
        print(e)
        print(response_data)
        # Show the API's response body when there is one (e.g. an error
        # payload without 'results'); otherwise fall back to the exception.
        error_message = str(response_data) if response_data is not None else str(e)
        return render_template('error.html', error_message=error_message)
from urllib.parse import quote, unquote
def _parse_user_config(user_config):
    """Split a ``"<max_new_tokens> 99 <instruction>"`` string into ``(int, str)``.

    The literal ``99`` is the separator between the token limit and the
    instruction. The whitespace-delimited form is matched first so that a
    token limit containing "99" (e.g. ``199``) or an instruction mentioning
    "99" is parsed correctly; otherwise the string is split at the first
    "99", which keeps compact inputs such as ``"10099text"`` working.

    Raises:
        ValueError: If no "99" separator is present or the token limit is
            not an integer.
    """
    import re

    match = re.fullmatch(r'\s*(\d+)\s+99(\s.*)', user_config, re.DOTALL)
    if match:
        return int(match.group(1)), match.group(2)

    max_tok, separator, instruction = user_config.partition("99")
    if not separator:
        raise ValueError("user config must look like '<max tokens> 99 <instruction>'")
    return int(max_tok), instruction


@app.route('/setup_n_generate_text', methods=['GET'])
def setup_n_generate_text():
    """Run a caller-configured prompt over the submitted text and render the result.

    Query parameters:
        user_config_area: ``"<max_new_tokens> 99 <instruction>"``; a default
            extraction instruction with a limit of 100 is used when absent.
        user_input: The text to process; a sample sentence is used when absent.

    Drug mentions found by ``find_drugs`` are appended to every prompt and
    also passed to ``result.html`` as a JSON string (de-duplicated by drug
    name), alongside the concatenated generated text and the original input.
    Any failure renders ``error.html`` with a generic message.
    """
    try:
        user_config = request.args.get('user_config_area', default="100 99 Extract medical information from the conversation specific to entities mentioned in the text. Categorize the information into demographics, Chief Complaint, HPI, PMHx, Social History, Family History, ROS, state and types of Physical Exam and Test, Medications and dosages, Plans and Other mentioned high-value information, and corresponding billing and icd codes. Only include explicitly mentioned information and notes when needed. refrain from inferring or adding details not present in the text.")
        user_input = request.args.get('user_input', default="I am patient with diabetic, taking advil and night quels with men medicaitons such as viagra")
        print("Got the variables")
        if not user_input:
            return render_template('error.html', error_message='Input text is required.')
        if not user_config:
            return render_template('error.html', error_message='congif setting is required.')

        # Parsed once, up front, so a malformed config fails before any API call.
        max_new_tokens, instruction = _parse_user_config(user_config)

        drug_names = find_drugs(user_input.split(" "),is_ignore_case=True)
        print("------Drugs-----")
        print(drug_names)
        drug_names2 = [entry[0]['name'] for entry in drug_names]
        print("------Drugs name-----")
        print(drug_names2)
        drug_names2 = " ".join(drug_names2)

        pieces = split_text_with_equal_size(user_input)
        url = "https://us-south.ml.cloud.ibm.com/ml/v1-beta/generation/text?version=2023-05-29"
        generated_texts = []
        for piece in pieces:
            payload = {
                "model_id": "meta-llama/llama-2-70b-chat",
                "input": instruction + " input text : " + piece + ". medications: " + drug_names2,
                "parameters": {"decoding_method": "greedy", "max_new_tokens": max_new_tokens, "min_new_tokens": 10, "stop_sequences": [], "repetition_penalty": 1},
                "project_id": "beaf6470-c5bc-4695-b204-29d09c8bf7fb",
                "moderations": {
                    "hap": {"input": True, "output": True, "threshold": 0.5, "mask": {"remove_entity_value": False}}
                },
            }
            print()
            print()
            print()
            print(">>>>> payload <<<<<")
            print(payload)
            # A timeout keeps a stalled connection from hanging the request
            # handler forever; it is generous because generation can be slow.
            response = requests.post(url, headers=header, json=payload, timeout=300)
            response_data = response.json()
            generated_texts.append(response_data['results'][0]['generated_text'])
            print()
            print()
            print()
            print(">>>>> RESULTS <<<<<")
            print(generated_texts)
        combined_result = "".join(generated_texts)

        # Build a JSON-serializable, de-duplicated copy of the drug matches.
        # Each match is indexed as (info dict, [1], [2]); the meaning of the
        # last two items is defined by find_drugs and is passed through as-is.
        unique_drug_tuples = []
        processed_drug_names = set()
        for drug_tuple in drug_names:
            # Work on a shallow copy: the dict comes from find_drugs and may be
            # shared with that library's own data (cannot be verified from
            # here), so it must not be modified in place.
            drug_info = dict(drug_tuple[0])
            if 'name' not in drug_info:
                continue
            drug_name = drug_info['name']
            if drug_name in processed_drug_names:
                continue
            if 'synonyms' in drug_info:
                # json.dumps cannot serialize a set, so convert to a list.
                drug_info['synonyms'] = list(drug_info['synonyms'])
            unique_drug_tuples.append((drug_info, drug_tuple[1], drug_tuple[2]))
            processed_drug_names.add(drug_name)

        serialized_drugs_corrected = json.dumps(unique_drug_tuples)
        print(serialized_drugs_corrected)
        return render_template('result.html', combined_result=combined_result , original_text=user_input, drugs=serialized_drugs_corrected)
    except Exception as e:
        print(e)
        return render_template('error.html', error_message='issue is here')
if __name__ == '__main__':
    import os

    # The server listens on all interfaces, so debug mode must not be on by
    # default: Flask's debug mode enables an interactive debugger that lets
    # anyone who can reach the port execute code. Set FLASK_DEBUG=1 to opt in
    # for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled, host='0.0.0.0', port=8099)