# Leaderboard

import editdistance, argparse
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from slue_toolkit.eval import eval_utils
from slue_toolkit.eval.eval_w2v_ner import make_distinct
eval_utils.get_stats = eval_utils.get_ner_stats #quickpatch
from IPython.display import HTML

def get_asr_wer(gold_df, submission_df):
    """Compute the corpus-level word error rate (WER), in percent.

    Args:
        gold_df: reference DataFrame with a ``pred_text`` column of transcripts.
        submission_df: hypothesis DataFrame, row-aligned with ``gold_df``,
            whose ``pred_text`` column may contain NaN for empty hypotheses.

    Returns:
        float: 100 * total word edit distance / total reference words.

    Raises:
        AssertionError: if the two DataFrames differ in length.
        ValueError: if the reference transcripts contain no words at all
            (WER is undefined; the original code raised ZeroDivisionError).
    """
    assert len(gold_df) == len(submission_df), "tsv length mismatch"
    total_errors = 0
    total_words = 0

    for ref_text, hyp_text in zip(gold_df.pred_text.to_list(), submission_df.pred_text.to_list()):
        ref_words = ref_text.split()
        # An empty cell in the submission tsv is read back as NaN; treat it
        # as an empty hypothesis rather than crashing on .split().
        hyp_words = [] if pd.isna(hyp_text) else hyp_text.split()
        total_errors += editdistance.eval(hyp_words, ref_words)
        total_words += len(ref_words)

    if total_words == 0:
        raise ValueError("reference transcripts contain no words; WER is undefined")
    return total_errors * 100.0 / total_words

def get_sentiment_f1(gold_df, submission_df):
    """Return the macro-averaged sentiment F1 score, scaled to percent.

    Both DataFrames must be row-aligned and carry a ``pred_sentiment`` column;
    ``gold_df`` provides the reference labels, ``submission_df`` the predictions.
    """
    reference_labels = gold_df.pred_sentiment.to_list()
    predicted_labels = submission_df.pred_sentiment.to_list()
    return 100 * f1_score(reference_labels, predicted_labels, average="macro")

def get_ner_distinct_label_lst(submission_df, gold_df):
    """Parse serialized NER labels for the slue-voxpopuli rows.

    Args:
        submission_df: DataFrame whose ``pred_ner`` column holds a Python
            literal string per row (e.g. "[['PLACE', 3, 2], ...]" or "[]").
        gold_df: row-aligned DataFrame whose ``set_name`` column selects the
            slue-voxpopuli subset (the mask is taken from gold so both gold
            and submission are filtered identically).

    Returns:
        list: one entry per selected row — ``make_distinct``-deduplicated
        label tuples, or an empty list for rows with no labels.
    """
    import ast  # literal_eval is safe on untrusted submission files, unlike eval()

    distinct_label_lst = []
    for label in submission_df[gold_df.set_name == "slue-voxpopuli"].pred_ner.to_list():
        # Parse once (the original called eval() twice per row).
        parsed = ast.literal_eval(label)
        if parsed:
            distinct_label_lst.append(make_distinct([tuple(lab) for lab in parsed]))
        else:
            distinct_label_lst.append([])
    return distinct_label_lst

def get_slue_score(submission_file, gt_file="submissions_rebalanced/gold.tsv", submission_name=None):
    """Score one submission tsv against the gold tsv.

    Args:
        submission_file: path to the submission tsv (tab-separated, with
            ``id``, ``pred_text``, ``pred_sentiment`` and ``pred_ner`` columns).
        gt_file: path to the gold tsv; must additionally carry ``set_name``
            identifying the slue-voxpopuli / slue-voxceleb subsets.
        submission_name: unused; kept for backward compatibility with callers.

    Returns:
        dict with keys 'wer_voxpopuli', 'wer_voxceleb', 'sentiment_macro_f1',
        'ner_micro_f1' and the aggregate 'slue_score'.
    """
    leaderboard_score = {}
    gold = pd.read_csv(gt_file, sep="\t")
    submission = pd.read_csv(submission_file, sep="\t")

    # Reorder the submission rows so they align 1:1 with the gold rows.
    # Build an id -> [index label] map once instead of re-scanning
    # submission.id for every gold id (the original loop was O(n^2)).
    positions_by_id = {}
    for index_label, utt_id in zip(submission.index, submission.id.to_list()):
        positions_by_id.setdefault(utt_id, []).append(index_label)
    new_order = []
    for utterance_id in gold.id:
        assert utterance_id in positions_by_id, f"missing id = {utterance_id}"
        matches = positions_by_id[utterance_id]
        assert len(matches) == 1, f"too many entities for id = {utterance_id}"
        new_order.append(matches[0])
    submission = submission.loc[new_order].reset_index(drop=True)

    # The subset masks are built from gold; after the reordering above the
    # two frames share the same RangeIndex, so the masks apply to both.
    vp_mask = gold.set_name == "slue-voxpopuli"
    vc_mask = gold.set_name == "slue-voxceleb"

    # WER on each ASR subset.
    leaderboard_score['wer_voxpopuli'] = get_asr_wer(gold[vp_mask], submission[vp_mask])
    leaderboard_score['wer_voxceleb'] = get_asr_wer(gold[vc_mask], submission[vc_mask])

    # Sentiment analysis macro F1 (voxceleb subset only).
    leaderboard_score['sentiment_macro_f1'] = get_sentiment_f1(gold[vc_mask], submission[vc_mask])

    # NER micro F1 (voxpopuli subset only; gold labels come from gold itself).
    gold_distinct_label_lst = get_ner_distinct_label_lst(gold, gold)
    submission_distinct_label_lst = get_ner_distinct_label_lst(submission, gold)
    ner_result = eval_utils.get_ner_scores(gold_distinct_label_lst, submission_distinct_label_lst)
    leaderboard_score['ner_micro_f1'] = ner_result['overall_micro']['fscore'] * 100

    # SLUE score: mean of (100 - average WER), sentiment F1 and NER F1.
    leaderboard_score['slue_score'] = (
        (100 - (leaderboard_score['wer_voxpopuli'] + leaderboard_score['wer_voxceleb']) / 2)
        + leaderboard_score['sentiment_macro_f1']
        + leaderboard_score['ner_micro_f1']
    ) / 3.0

    return leaderboard_score

# Submission name -> reference URL (rendered as the "ref" link column).
# Each key follows "<system type>_<model description>" and must match a
# "submissions_rebalanced/<name>.tsv" file read by get_slue_score below.
submissions = {
#     "arxiv_indomain_w2v2-base-ls960": "https://arxiv.org/abs/2111.10367",
#     "arxiv_indomain_w2v2-large-ll60k": "https://arxiv.org/abs/2111.10367",    
    "NLP-topline_bert-b": "https://arxiv.org/abs/2111.10367",
    "NLP-topline_deberta-b": "https://arxiv.org/abs/2111.10367",
    "NLP-topline_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_bert-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_deberta-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-l-ll60k_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_bert-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_deberta-b": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-b-ls960+lm_deberta-l": "https://arxiv.org/abs/2111.10367",
    "pipeline_w2v2-l-ll60k+lm_deberta-l": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-ls960": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-vp100k": "https://arxiv.org/abs/2111.10367",
    "e2e_hubert-b-ls960": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-l-ll60k": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-ls960+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-b-vp100k+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_hubert-b-ls960+lm": "https://arxiv.org/abs/2111.10367",
    "e2e_w2v2-l-ll60k+lm": "https://arxiv.org/abs/2111.10367",
}
#                "e2e_SEW-D-mid-LS960": "https://arxiv.org/abs/2109.06870",\
#                "e2e_SEW-D-mid-LS960+LM": "https://arxiv.org/abs/2109.06870"}


# Score every submission and collect the leaderboard columns.
# Each key of leaderboard_scores becomes one DataFrame column below.
leaderboard_scores = {}
leaderboard_scores['ref'] = []
leaderboard_scores['submission'] = []
leaderboard_scores['system type'] = []
for submission_name in submissions.keys():
    score = get_slue_score(f"submissions_rebalanced/{submission_name}.tsv")
    # Split "<system type>_<model>" into the two display columns.
    leaderboard_scores['submission'].append('_'.join(submission_name.split("_")[1:]).upper())
    leaderboard_scores['system type'].append(submission_name.split("_")[0])
    ref_link = '-'
    if submissions[submission_name]:
        # Render the reference URL as an HTML anchor for the notebook table.
        ref_link = f'<a target="_blank" href="{submissions[submission_name]}">link</a>'
    leaderboard_scores['ref'].append(ref_link)
    # Append the metric columns ('wer_voxpopuli', ..., 'slue_score'),
    # creating each column list on first sight.
    for key in score.keys():
        if not key in leaderboard_scores:
            leaderboard_scores[key]=[]
        leaderboard_scores[key].append(score[key])


# Rank submissions by SLUE score, best first (argsort is ascending,
# so reverse it for a descending leaderboard).
rank_order = np.argsort(leaderboard_scores['slue_score'])
rank_order = rank_order[::-1]
df = pd.DataFrame.from_dict(leaderboard_scores)
# rank_order = max(rank_order)-rank_order
# rank_order holds positional indices which coincide with the default
# RangeIndex labels, so .loc reorders rows best-to-worst.
df = df.loc[rank_order]
df['Ranking'] = np.arange(len(rank_order))+1
# Fix the column order, then rename to the display headers.
df = df[['submission','Ranking','system type', 'slue_score','wer_voxpopuli', 'wer_voxceleb', 'ner_micro_f1','sentiment_macro_f1','ref']]
df = df.rename(columns={"submission": "Submission"})
df = df.rename(columns={"system type": "System type"})
df = df.rename(columns={"slue_score": "SLUE-score"})
df = df.rename(columns={"wer_voxpopuli": "WER(p)"})
df = df.rename(columns={"wer_voxceleb": "WER(c)"})
df = df.rename(columns={"ner_micro_f1": "NER f-1"})
df = df.rename(columns={"sentiment_macro_f1": "SA f-1"})
df = df.set_index("Submission")

# CSS fragments consumed by df.style.set_table_styles below.
headers = {
    'selector': 'th',
    'props': 'background-color: white; color: black; width: 75px; font-weight:bold;font-size:11px'
}
index_names = {
    'selector': '.index_name',
    'props': 'font-style: normal; color: black; font-weight:bold; width:300px;font-size:12px'}
cell_hover = {  # for row hover use <tr> instead of <td>
    'selector': 'td:hover',
    'props': [('background-color', '#ffffb3'),("font-size", "130%"),("font-weight","bold")]
}
# NOTE(review): this shadows the `submission` DataFrame name used inside
# get_slue_score's scope earlier in the notebook — harmless here, but confusing.
submission = {
    'selector': 'td',
    'props': 'font-style: italic; color: black; font-weight:normal;'}
caption = dict(selector="caption", props=[("caption-side", "bottom")])


# df.style.\
#     set_table_attributes('style="font-size: 14px"',).\
#     set_table_styles([submission,caption,cell_hover,index_names,headers], overwrite=False).\
#     format(precision=1).\
#     set_caption("*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb")



# Render the styled leaderboard as HTML for display in the notebook,
# applying the CSS fragments defined above and 1-decimal number formatting.
HTML(df.style.\
    set_table_attributes('style="font-size: 14px"',).\
    set_table_styles([submission,caption,cell_hover,index_names,headers], overwrite=False).\
    format(precision=1).\
    set_caption("*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb").to_html())
        
*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb
  Ranking System type SLUE-score WER(p) WER(c) NER f-1 SA f-1 ref
Submission                
DEBERTA-L 1 NLP-topline 82.7 0.0 0.0 81.4 66.8 link
DEBERTA-B 2 NLP-topline 82.3 0.0 0.0 81.4 65.5 link
BERT-B 3 NLP-topline 81.5 0.0 0.0 81.2 63.3 link
W2V2-L-LL60K+LM_DEBERTA-L 4 pipeline 75.7 9.3 11.1 71.8 65.5 link
W2V2-B-LS960+LM_DEBERTA-L 5 pipeline 73.5 12.3 16.1 69.2 65.4 link
W2V2-B-LS960+LM_DEBERTA-B 6 pipeline 72.6 12.3 16.1 68.5 63.6 link
W2V2-B-LS960+LM_BERT-B 7 pipeline 71.6 12.3 16.1 68.6 60.5 link
W2V2-L-LL60K_DEBERTA-L 8 pipeline 70.8 12.1 13.8 59.7 65.7 link
W2V2-L-LL60K+LM 9 e2e 68.2 9.3 11.1 64.8 50.1 link
W2V2-B-LS960+LM 10 e2e 65.9 12.3 16.1 63.4 48.6 link
HUBERT-B-LS960+LM 11 e2e 64.9 16.8 16.9 61.9 49.4 link
W2V2-B-LS960_DEBERTA-L 12 pipeline 64.5 18.4 20.9 49.5 63.6 link
W2V2-B-LS960_DEBERTA-B 13 pipeline 63.0 18.4 20.9 46.4 62.1 link
W2V2-B-LS960_BERT-B 14 pipeline 62.6 18.4 20.9 47.4 60.1 link
W2V2-L-LL60K 15 e2e 62.5 12.1 13.8 50.5 50.1 link
W2V2-B-VP100K+LM 16 e2e 60.2 17.3 23.0 61.8 38.9 link
W2V2-B-LS960 17 e2e 59.5 18.4 20.9 49.6 48.6 link
HUBERT-B-LS960 18 e2e 59.5 19.6 21.7 49.8 49.4 link
W2V2-B-VP100K 19 e2e 52.8 22.8 33.7 47.9 38.9 link