Muhammad Haris committed
Commit 380d8a4
1 Parent(s): 261d5bd

Update app.py

Files changed (1):
app.py  +6 -11
app.py CHANGED
@@ -8,12 +8,7 @@ import torch
 import gdown
 import os

-
-# file_id = '1P3Nz6f3KG0m0kO_2pEfnVIhgP8Bvkl4v'
-# url = f'https://drive.google.com/uc?id={file_id}'
-# excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')
-# Download the file from Hugging Face Spaces
-
+# Download the CSV file from Hugging Face Spaces
 url = 'https://huggingface.co/datasets/HEHEBOIBOT/PharmEvoDiabetesData/raw/main/medical_data.csv'
 excel_file_path = os.path.join(os.path.expanduser("~"), 'medical_data.csv')

@@ -27,7 +22,7 @@ except UnicodeDecodeError:

 # TF-IDF Vectorization
 vectorizer = TfidfVectorizer(stop_words='english')
-X_tfidf = vectorizer.fit_transform(medical_df['Questions'])
+X_tfidf = vectorizer.fit_transform(medical_df.iloc[:, 0]) # Accessing first column by index

 # Load pre-trained GPT-2 model and tokenizer
 model_name = "sshleifer/tiny-gpt2"
@@ -47,11 +42,11 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_

     # Find the most similar question using semantic similarity
     question_embedding = sbert_model.encode(question, convert_to_tensor=True)
-    similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df['Questions'].tolist(), convert_to_tensor=True)).flatten()
+    similarities = util.pytorch_cos_sim(question_embedding, sbert_model.encode(medical_df.iloc[:, 0].tolist(), convert_to_tensor=True)).flatten()
     max_sim_index = similarities.argmax().item()

     # LLM response generation
-    input_text = "DiBot: " + medical_df.iloc[max_sim_index]['Questions']
+    input_text = "DiBot: " + medical_df.iloc[max_sim_index][0]
     input_ids = tokenizer.encode(input_text, return_tensors="pt")
     attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
     pad_token_id = tokenizer.eos_token_id
@@ -61,7 +56,7 @@ def get_medical_response(question, vectorizer, X_tfidf, model, tokenizer, sbert_
     # Compare similarities and choose the best response
     if tfidf_similarities.max() > 0.5:
         tfidf_index = tfidf_similarities.argmax()
-        return medical_df.iloc[tfidf_index]['Answers']
+        return medical_df.iloc[tfidf_index][1] # Assuming 'Answers' is in the second column (index 1)
     else:
         return lm_generated_response

@@ -85,4 +80,4 @@ if user_input:
 # Display the chat messages
 for message in st.session_state.messages:
     with st.chat_message(message["role"]):
-        st.markdown(message["content"])
+        st.markdown(message["content"])
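For reference, below is a minimal, self-contained sketch of the lookup pattern this commit switches to: positional column access with pandas iloc (questions in the first column, answers in the second) combined with the TF-IDF similarity threshold that decides between returning a stored answer and falling back to the generated response. The two-row DataFrame and the cosine_similarity call are assumptions made for illustration only; the hunks above do not show how tfidf_similarities is actually computed in app.py, and the SBERT lookup is omitted here.

# Sketch only: the DataFrame contents and the cosine_similarity step are
# illustrative assumptions; the .iloc access pattern mirrors the diff above.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical stand-in for medical_data.csv: questions in the first column,
# answers in the second, addressed by position rather than by header name.
medical_df = pd.DataFrame({
    "Questions": ["What is type 2 diabetes?", "How is metformin taken?"],
    "Answers": ["A chronic condition that affects how the body processes blood sugar.",
                "As prescribed, usually with meals to reduce stomach upset."],
})

vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(medical_df.iloc[:, 0])  # first column by index

def lookup(question):
    # Cosine similarity between the user question and every stored question
    tfidf_similarities = cosine_similarity(vectorizer.transform([question]), X_tfidf).flatten()
    if tfidf_similarities.max() > 0.5:
        tfidf_index = tfidf_similarities.argmax()
        return medical_df.iloc[tfidf_index, 1]  # second column = answer
    return "(fall back to the LLM-generated response)"

print(lookup("What is type 2 diabetes?"))

Note that medical_df.iloc[tfidf_index, 1] reads the same cell as the diff's medical_df.iloc[tfidf_index][1], but without the positional [] access on a Series, which newer pandas releases warn about.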