shounakpaul95 committed
Commit: 8390a54
Parent: 7980958

Update eval_utils.py

Files changed (1)
  1. eval_utils.py +12 -15
eval_utils.py CHANGED
@@ -7,7 +7,6 @@ import nltk
 import numpy as np
 
 from nervaluate import Evaluator
-# from rouge_score import rouge_scorer
 from sacrebleu.metrics import BLEU, CHRF
 from sklearn.metrics import f1_score
 from tqdm import tqdm
@@ -37,7 +36,7 @@ def evaluate_bail(gold_data, pred_data):
 
     f1 = f1_score(gold_labels, pred_labels, average="macro")
     print("Macro-F1 on HLDC-all-districts test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 def get_BLEU_score(ref_text_all, machine_text_all):
     sc_all = []
@@ -90,7 +89,7 @@ def evaluate_cjpe(gold_data, pred_data):
     }
     print("Explanability for ILDC Expert:", explanation_result)
     #return {**prediction_result, **explanation_result}
-    return {"mF1": f1, "ROUGE-L": rouge_score, "BLEU": bleu_score}
+    return {"mF1": f1*100, "ROUGE-L": rouge_score*100, "BLEU": bleu_score*100}
 
 def span2bio(txt, roles):
     roles = sorted(roles, key = lambda x:x['start'])
@@ -162,7 +161,7 @@ def evaluate_lner(gold_data, pred_data, text_data):
         results_per_fold[f"fold_{fold}"] = avg_f1
 
     print("Strict macro-F1 on L-NER Dataset:", results_per_fold)
-    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)}
+    return {"strict mF1": sum(results_per_fold.values())/len(results_per_fold)*100}
 
 
 def evaluate_rr(gold_data, pred_data):
@@ -188,7 +187,7 @@ def evaluate_rr(gold_data, pred_data):
 
     f1 = f1_score(all_gold_labels, all_pred_labels, average="macro")
     print(f"Macro-F1 on combined test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 
 def evaluate_lsi(gold_data, pred_data):
@@ -211,7 +210,7 @@ def evaluate_lsi(gold_data, pred_data):
 
     f1 = f1_score(gold_matrix, pred_matrix, average="macro")
     print("Macro-F1 on ILSI test set:", f1)
-    return {"mF1": f1}
+    return {"mF1": f1*100}
 
 
 def evaluate_pcr(gold_data, pred_data):
@@ -241,7 +240,7 @@ def evaluate_pcr(gold_data, pred_data):
 
     max_f1 = max(f1_scores)
     index_max = f1_scores.index(max_f1) + 1
-    return {"muF1@K": f"{max_f1:.2f}@{index_max}"}
+    return {"muF1@K": f"{max_f1*100:.2f}@{index_max}"}
 
 
 def evaluate_summ(gold_data, pred_data):
@@ -257,15 +256,13 @@ def evaluate_summ(gold_data, pred_data):
         pred_summaries.append(pred_summary)
 
 
-    # rl_evaluator = rouge.Rouge(metrics=['rouge-n','rouge-l'], max_n=2, limit_length=False, apply_avg=True)
-    # rl_scores = rl_evaluator.get_scores(pred_summaries, gold_summaries)
-    # print("Rouge:", {k:v['f'] for k,v in rl_scores.items()}, flush=True)
+    rl_evaluator = rouge.Rouge(metrics=['rouge-n','rouge-l'], max_n=2, limit_length=False, apply_avg=True)
+    rl_scores = rl_evaluator.get_scores(pred_summaries, gold_summaries)
+    print("Rouge:", {k:v['f'] for k,v in rl_scores.items()}, flush=True)
 
     _, _, bs = bert_score.score(pred_summaries, gold_summaries, lang="en", verbose=True)
     print("BERTSCORE:", bs.mean().item())
-    # return {'ROUGE-L': rl_scores['rouge-l']['f'], 'BERTSCORE': bs.mean().item()}
-    return {'ROUGE-L': '-', 'BERTSCORE': bs.mean().item()}
-
+    return {'ROUGE-L': rl_scores['rouge-l']['f'] * 100, 'BERTSCORE': bs.mean().item() * 100}
 
 def evaluate_lmt(gold_data, pred_data):
     tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert", use_fast=False)
@@ -308,14 +305,14 @@ def evaluate_lmt(gold_data, pred_data):
 
     return {
         "BLEU": sum(bleu_scores) / len(bleu_scores),
-        "GLEU": sum(gleu_scores) / len(gleu_scores),
+        "GLEU": sum(gleu_scores) / len(gleu_scores) * 100,
         "chrF++": sum(chrfpp_scores) / len(chrfpp_scores),
     }
 
 
 def create_output_json(evaluation_results):
     output = {
-        "Method": "Dummy Ideal Only Summ",
+        "Method": "Dummy Ideal Only Summ 2",
         "Submitted By": "IL-TUR",
         "Github Link": "dummy submission",
         "L-NER": {"strict mF1": evaluation_results["lner"]["strict mF1"]},