import editdistance
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from slue_toolkit.eval import eval_utils
from slue_toolkit.eval.eval_w2v_ner import make_distinct
from IPython.display import HTML

# quick patch: alias get_ner_stats under the get_stats name
eval_utils.get_stats = eval_utils.get_ner_stats
def get_asr_wer(gold_df, submission_df):
    """Corpus-level word error rate (%) between gold and submitted transcripts."""
    assert len(gold_df) == len(submission_df), "tsv length mismatch"
    errs_t = 0
    lengths_t = 0
    for tgt_words, hyp_words in zip(gold_df.pred_text.to_list(), submission_df.pred_text.to_list()):
        tgt_words = tgt_words.split()
        # a missing hypothesis counts every reference word as an error
        if pd.isna(hyp_words):
            hyp_words = []
        else:
            hyp_words = hyp_words.split()
        errs_t += editdistance.eval(hyp_words, tgt_words)
        lengths_t += len(tgt_words)
    wer = errs_t * 100.0 / lengths_t
    return wer
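# Quick sanity check with hypothetical two-row frames (not part of the leaderboard run):
#   gold = pd.DataFrame({"pred_text": ["hello world", "a b c"]})
#   hyp = pd.DataFrame({"pred_text": ["hello word", "a b"]})
#   get_asr_wer(gold, hyp)  # 2 errors over 5 reference words -> 40.0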
def get_sentiment_f1(gold_df, submission_df):
    """Macro-averaged sentiment F1 (%) of submitted labels against gold labels."""
    gt = gold_df.pred_sentiment.to_list()
    pred = submission_df.pred_sentiment.to_list()
    macro_f1 = f1_score(gt, pred, average="macro") * 100
    return macro_f1
def get_ner_distinct_label_lst(pred_df, gold_df):
    """Parse the pred_ner column (slue-voxpopuli rows only) into per-utterance lists of distinct entity tuples."""
    distinct_label_lst = []
    # pred_df is assumed to be row-aligned with gold_df, so gold's set_name mask selects the voxpopuli rows
    for label in pred_df[gold_df.set_name == "slue-voxpopuli"].pred_ner.to_list():
        parsed = eval(label)  # pred_ner is stored as a python-literal string
        if parsed:
            label_lst = [tuple(lab) for lab in parsed]
            distinct_label_lst.append(make_distinct(label_lst))
        else:
            distinct_label_lst.append([])
    return distinct_label_lst
def get_slue_score(submission_file, gt_file="submissions_rebalanced/gold.tsv", submission_name=None):
    """Compute the per-task metrics and the combined SLUE score for one submission tsv."""
    leaderboard_score = {}
    gold = pd.read_csv(gt_file, sep="\t")
    submission = pd.read_csv(submission_file, sep="\t")

    # sort the submission rows into the same utterance-id order as the gold tsv
    new_order = []
    for utterance_id in gold.id:
        assert utterance_id in submission.id.to_list(), f"missing id = {utterance_id}"
        index = submission.index[submission.id == utterance_id]
        assert len(index) == 1, f"too many entries for id = {utterance_id}"
        new_order.append(index[0])
    submission = submission.loc[new_order].reset_index(drop=True)

    # WER on slue-voxpopuli (gold and submission are now row-aligned, so gold's mask indexes both)
    leaderboard_score['wer_voxpopuli'] = get_asr_wer(gold[gold.set_name == "slue-voxpopuli"],
                                                     submission[gold.set_name == "slue-voxpopuli"])
    # WER on slue-voxceleb
    leaderboard_score['wer_voxceleb'] = get_asr_wer(gold[gold.set_name == "slue-voxceleb"],
                                                    submission[gold.set_name == "slue-voxceleb"])
    # sentiment analysis macro F1 (voxceleb only)
    leaderboard_score['sentiment_macro_f1'] = get_sentiment_f1(gold[gold.set_name == "slue-voxceleb"],
                                                               submission[gold.set_name == "slue-voxceleb"])
    # NER micro F1 (voxpopuli only)
    gold_distinct_label_lst = get_ner_distinct_label_lst(gold, gold)
    submission_distinct_label_lst = get_ner_distinct_label_lst(submission, gold)
    ner_result = eval_utils.get_ner_scores(gold_distinct_label_lst, submission_distinct_label_lst)
    leaderboard_score['ner_micro_f1'] = ner_result['overall_micro']['fscore'] * 100

    # SLUE score = ((100 - mean WER) + sentiment macro F1 + NER micro F1) / 3
    leaderboard_score['slue_score'] = (
        (100 - (leaderboard_score['wer_voxpopuli'] + leaderboard_score['wer_voxceleb']) / 2)
        + leaderboard_score['sentiment_macro_f1']
        + leaderboard_score['ner_micro_f1']
    ) / 3.0
    return leaderboard_score
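# Sketch of standalone use (assumes a submission tsv with the same columns as gold.tsv:
# id, set_name, pred_text, pred_sentiment, pred_ner):
#   scores = get_slue_score("submissions_rebalanced/e2e_w2v2-b-ls960.tsv")
#   print(scores["slue_score"])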
submissions = {
    # "arxiv_indomain_w2v2-base-ls960": "https://arxiv.org/abs/2111.10367",
    # "arxiv_indomain_w2v2-large-ll60k": "https://arxiv.org/abs/2111.10367",
    "NLP-topline_bert-b": "https://arxiv.org/abs/2111.10367",
    "NLP-topline_deberta-b": "https://arxiv.org/abs/2111.10367",
    "NLP-topline_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_bert-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_deberta-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-l-ll60k_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_bert-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_deberta-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-l-ll60k+lm_deberta-l": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-ls960": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-vp100k": "https://arxiv.org/abs/2111.10367",
    "e2e_hubert-b-ls960": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-l-ll60k": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-ls960+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-vp100k+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_hubert-b-ls960+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-l-ll60k+lm": "https://arxiv.org/abs/2111.10367",
    # "e2e_SEW-D-mid-LS960": "https://arxiv.org/abs/2109.06870",
    # "e2e_SEW-D-mid-LS960+LM": "https://arxiv.org/abs/2109.06870",
}
# collect per-submission scores into column lists for the leaderboard table
leaderboard_scores = {}
leaderboard_scores['ref'] = []
leaderboard_scores['submission'] = []
leaderboard_scores['system type'] = []
for submission_name in submissions.keys():
    score = get_slue_score(f"submissions_rebalanced/{submission_name}.tsv")
    # the part before the first "_" is the system type, the rest is the model name
    leaderboard_scores['submission'].append('_'.join(submission_name.split("_")[1:]).upper())
    leaderboard_scores['system type'].append(submission_name.split("_")[0])
    ref_link = '-'
    if submissions[submission_name]:
        ref_link = f'<a target="_blank" href="{submissions[submission_name]}">link</a>'
    leaderboard_scores['ref'].append(ref_link)
    for key in score.keys():
        if key not in leaderboard_scores:
            leaderboard_scores[key] = []
        leaderboard_scores[key].append(score[key])
# rank submissions by SLUE score, best first
rank_order = np.argsort(leaderboard_scores['slue_score'])[::-1]
df = pd.DataFrame.from_dict(leaderboard_scores)
df = df.loc[rank_order]
df['Ranking'] = np.arange(len(rank_order)) + 1
df = df[['submission', 'Ranking', 'system type', 'slue_score', 'wer_voxpopuli',
         'wer_voxceleb', 'ner_micro_f1', 'sentiment_macro_f1', 'ref']]
df = df.rename(columns={
    "submission": "Submission",
    "system type": "System type",
    "slue_score": "SLUE-score",
    "wer_voxpopuli": "WER(p)",
    "wer_voxceleb": "WER(c)",
    "ner_micro_f1": "NER f-1",
    "sentiment_macro_f1": "SA f-1",
})
df = df.set_index("Submission")
headers = {
    'selector': 'th',
    'props': 'background-color: white; color: black; width: 75px; font-weight:bold; font-size:11px'
}
index_names = {
    'selector': '.index_name',
    'props': 'font-style: normal; color: black; font-weight:bold; width:300px; font-size:12px'
}
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3'), ("font-size", "130%"), ("font-weight", "bold")]
}
submission = {
    'selector': 'td',
    'props': 'font-style: italic; color: black; font-weight:normal;'
}
caption = dict(selector="caption", props=[("caption-side", "bottom")])
# render the styled leaderboard as an html table (for notebook display)
HTML(
    df.style
    .set_table_attributes('style="font-size: 14px"')
    .set_table_styles([submission, caption, cell_hover, index_names, headers], overwrite=False)
    .format(precision=1)
    .set_caption("*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb")
    .to_html()
)