{ "cells": [ { "cell_type": "markdown", "id": "9eb97885", "metadata": {}, "source": [ "# Leaderboard" ] }, { "cell_type": "code", "execution_count": 3, "id": "e07fb041", "metadata": { "tags": [ "hide-input" ] }, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb
 RankingSystem typeSLUE-scoreWER(p)WER(c)NER f-1SA f-1ref
Submission        
DEBERTA-L1NLP-topline82.70.00.081.466.8link
DEBERTA-B2NLP-topline82.30.00.081.465.5link
BERT-B3NLP-topline81.50.00.081.263.3link
W2V2-L-LL60K+LM_DEBERTA-L4pipeline75.79.311.171.865.5link
W2V2-B-LS960+LM_DEBERTA-L5pipeline73.512.316.169.265.4link
W2V2-B-LS960+LM_DEBERTA-B6pipeline72.612.316.168.563.6link
W2V2-B-LS960+LM_BERT-B7pipeline71.612.316.168.660.5link
W2V2-L-LL60K_DEBERTA-L8pipeline70.812.113.859.765.7link
W2V2-L-LL60K+LM9e2e68.29.311.164.850.1link
W2V2-B-LS960+LM10e2e65.912.316.163.448.6link
HUBERT-B-LS960+LM11e2e64.916.816.961.949.4link
W2V2-B-LS960_DEBERTA-L12pipeline64.518.420.949.563.6link
W2V2-B-LS960_DEBERTA-B13pipeline63.018.420.946.462.1link
W2V2-B-LS960_BERT-B14pipeline62.618.420.947.460.1link
W2V2-L-LL60K15e2e62.512.113.850.550.1link
W2V2-B-VP100K+LM16e2e60.217.323.061.838.9link
W2V2-B-LS96017e2e59.518.420.949.648.6link
HUBERT-B-LS96018e2e59.519.621.749.849.4link
W2V2-B-VP100K19e2e52.822.833.747.938.9link
\n" ], "text/plain": [ "" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import ast\n",
"import editdistance, argparse\n",
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.metrics import f1_score, precision_score, recall_score\n",
"from slue_toolkit.eval import eval_utils\n",
"from slue_toolkit.eval.eval_w2v_ner import make_distinct\n",
"eval_utils.get_stats = eval_utils.get_ner_stats #quickpatch\n",
"from IPython.display import HTML\n",
"\n",
"def get_asr_wer(gold_df,submission_df):\n",
"    \"\"\"Return the corpus-level word error rate, in percent.\n",
"\n",
"    Word-level edit distances between the `pred_text` of row-aligned gold and\n",
"    submission rows are summed and divided by the total number of gold words.\n",
"    A missing (NaN) transcript on either side counts as an empty word sequence.\n",
"\n",
"    Raises AssertionError if the frames differ in length or if the gold side\n",
"    contains no words at all.\n",
"    \"\"\"\n",
"    assert len(gold_df)==len(submission_df), \"tsv length mismatch\"\n",
"    errs_t = 0\n",
"    lengths_t = 0\n",
"\n",
"    for tgt_text,hyp_text in zip(gold_df.pred_text.to_list(), submission_df.pred_text.to_list()):\n",
"        # pandas reads an empty TSV field as NaN, which has no .split()\n",
"        tgt_words = [] if pd.isna(tgt_text) else tgt_text.split()\n",
"        hyp_words = [] if pd.isna(hyp_text) else hyp_text.split()\n",
"        errs_t += editdistance.eval(hyp_words, tgt_words)\n",
"        lengths_t += len(tgt_words)\n",
"\n",
"    # WER is undefined without reference words; fail with a clear message\n",
"    # instead of a bare ZeroDivisionError.\n",
"    assert lengths_t > 0, \"no reference words to score\"\n",
"    return errs_t * 100.0 / lengths_t\n",
"\n",
"def get_sentiment_f1(gold_df,submission_df):\n",
"    \"\"\"Return the macro-averaged F1, in percent, over the `pred_sentiment` columns.\"\"\"\n",
"    gt = gold_df.pred_sentiment.to_list()\n",
"    pred = submission_df.pred_sentiment.to_list()\n",
"    return f1_score(gt, pred, average=\"macro\") * 100\n",
"\n",
"def get_ner_distinct_label_lst(submission_df,gold_df):\n",
"    \"\"\"Parse the slue-voxpopuli `pred_ner` cells into de-duplicated label lists.\n",
"\n",
"    Rows are selected with `gold_df.set_name`, so `submission_df` must be\n",
"    row-aligned with `gold_df`. Each cell is the string form of a Python\n",
"    literal; every label in it is converted to a tuple and the resulting list\n",
"    is passed through `make_distinct`. A cell that parses to an empty value\n",
"    yields an empty list.\n",
"    \"\"\"\n",
"    distinct_label_lst = []\n",
"    for label in submission_df[gold_df.set_name==\"slue-voxpopuli\"].pred_ner.to_list():\n",
"        # literal_eval, not eval: the text comes from a submitted file and must\n",
"        # never be executed as code. Parse once and reuse the result.\n",
"        parsed = ast.literal_eval(label)\n",
"        if parsed:\n",
"            distinct_label_lst.append(make_distinct([tuple(lab) for lab in parsed]))\n",
"        else:\n",
"            distinct_label_lst.append([])\n",
"    return distinct_label_lst\n",
"\n",
"def get_slue_score(submission_file,gt_file = \"submissions_rebalanced/gold.tsv\",submission_name=None):\n",
"    \"\"\"Score one submission TSV against the gold TSV.\n",
"\n",
"    Returns a dict with the keys `wer_voxpopuli`, `wer_voxceleb`,\n",
"    `sentiment_macro_f1`, `ner_micro_f1` and `slue_score`. Every gold\n",
"    utterance id must occur exactly once in the submission, otherwise an\n",
"    AssertionError is raised. `submission_name` is accepted for backward\n",
"    compatibility and is not used.\n",
"    \"\"\"\n",
"    leaderboard_score = {}\n",
"    gold = pd.read_csv(gt_file,sep=\"\\t\")\n",
"    submission = pd.read_csv(f\"{submission_file}\",sep=\"\\t\")\n",
"\n",
"    # Validate the ids with one counting pass instead of rescanning the whole\n",
"    # submission for every gold utterance.\n",
"    id_counts = submission.id.value_counts()\n",
"    for utterance_id in gold.id:\n",
"        assert utterance_id in id_counts.index,f\"missing id = {utterance_id}\"\n",
"        assert id_counts.loc[utterance_id]==1, f\"too many entities for id = {utterance_id}\"\n",
"\n",
"    # Reorder the submission rows to follow the gold utterance order; the fresh\n",
"    # 0..n-1 index is what lets the gold boolean masks below select the same rows.\n",
"    submission = submission.set_index(\"id\", drop=False).loc[gold.id.to_list()].reset_index(drop=True)\n",
"\n",
"    is_voxpopuli = gold.set_name==\"slue-voxpopuli\"\n",
"    is_voxceleb = gold.set_name==\"slue-voxceleb\"\n",
"\n",
"    #WER-Voxpopuli\n",
"    leaderboard_score['wer_voxpopuli'] = get_asr_wer(gold[is_voxpopuli],submission[is_voxpopuli])\n",
"\n",
"    #WER-Voxceleb\n",
"    leaderboard_score['wer_voxceleb'] = get_asr_wer(gold[is_voxceleb],submission[is_voxceleb])\n",
"\n",
"    # sentiment analysis f1 score\n",
"    leaderboard_score['sentiment_macro_f1'] = get_sentiment_f1(gold[is_voxceleb],submission[is_voxceleb])\n",
"\n",
"    # NER f1 score\n",
"    gold_distinct_label_lst = get_ner_distinct_label_lst(gold,gold)\n",
"    submission_distinct_label_lst = get_ner_distinct_label_lst(submission,gold)\n",
"    ner_result = eval_utils.get_ner_scores(gold_distinct_label_lst,submission_distinct_label_lst)\n",
"    leaderboard_score['ner_micro_f1'] = ner_result['overall_micro']['fscore']*100\n",
"\n",
"    # SLUE-score: mean of (100 - average WER), sentiment macro F1 and NER micro F1\n",
"    mean_wer = (leaderboard_score['wer_voxpopuli']+leaderboard_score['wer_voxceleb'])/2\n",
"    leaderboard_score['slue_score'] = ((100 - mean_wer)\n",
"                                       + leaderboard_score['sentiment_macro_f1']\n",
"                                       + leaderboard_score['ner_micro_f1'])/3.0\n",
"\n",
"    return leaderboard_score\n",
"\n",
"# Submission file stem (\"<system type>_<system name>\") -> reference URL.\n",
"submissions = {\n",
"#     \"arxiv_indomain_w2v2-base-ls960\": \"https://arxiv.org/abs/2111.10367\",\n",
"#     \"arxiv_indomain_w2v2-large-ll60k\": \"https://arxiv.org/abs/2111.10367\", \n",
"    \"NLP-topline_bert-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"NLP-topline_deberta-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"NLP-topline_deberta-l\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960_bert-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960_deberta-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960_deberta-l\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-l-ll60k_deberta-l\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960+lm_bert-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960+lm_deberta-b\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-b-ls960+lm_deberta-l\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"pipeline_w2v2-l-ll60k+lm_deberta-l\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-b-ls960\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-b-vp100k\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_hubert-b-ls960\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-l-ll60k\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-b-ls960+lm\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-b-vp100k+lm\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_hubert-b-ls960+lm\": \"https://arxiv.org/abs/2111.10367\",\n",
"    \"e2e_w2v2-l-ll60k+lm\": \"https://arxiv.org/abs/2111.10367\",\n",
"#     \"e2e_SEW-D-mid-LS960\": \"https://arxiv.org/abs/2109.06870\",\n",
"#     \"e2e_SEW-D-mid-LS960+LM\": \"https://arxiv.org/abs/2109.06870\",\n",
"}\n",
"\n",
"\n",
"# Column-oriented accumulator: one list per leaderboard column.\n",
"leaderboard_scores = {}\n",
"leaderboard_scores['ref'] = []\n",
"leaderboard_scores['submission'] = []\n",
"leaderboard_scores['system type'] = []\n",
"for submission_name, ref_url in submissions.items():\n",
"    score = get_slue_score(f\"submissions_rebalanced/{submission_name}.tsv\")\n",
"    # Text before the first underscore is the system type, the rest is the system name\n",
"    system_type, _, system_name = submission_name.partition(\"_\")\n",
"    leaderboard_scores['submission'].append(system_name.upper())\n",
"    leaderboard_scores['system type'].append(system_type)\n",
"    # Point the link at the reference URL; with a constant 'link' string the\n",
"    # URL stored in `submissions` never reached the table.\n",
"    ref_link = f'<a href=\"{ref_url}\">link</a>' if ref_url else '-'\n",
"    leaderboard_scores['ref'].append(ref_link)\n",
"    for key, value in score.items():\n",
"        leaderboard_scores.setdefault(key, []).append(value)\n",
"\n",
"\n",
"df = pd.DataFrame.from_dict(leaderboard_scores)\n",
"# Best SLUE-score first. A stable sort keeps tied systems in submission order,\n",
"# whereas reversing an ascending argsort also reverses the ties.\n",
"df = df.sort_values('slue_score', ascending=False, kind='mergesort')\n",
"df['Ranking'] = np.arange(len(df))+1\n",
"df = df[['submission','Ranking','system type', 'slue_score','wer_voxpopuli', 'wer_voxceleb', 'ner_micro_f1','sentiment_macro_f1','ref']]\n",
"df = df.rename(columns={\n",
"    \"submission\": \"Submission\",\n",
"    \"system type\": \"System type\",\n",
"    \"slue_score\": \"SLUE-score\",\n",
"    \"wer_voxpopuli\": \"WER(p)\",\n",
"    \"wer_voxceleb\": \"WER(c)\",\n",
"    \"ner_micro_f1\": \"NER f-1\",\n",
"    \"sentiment_macro_f1\": \"SA f-1\",\n",
"})\n",
"df = df.set_index(\"Submission\")\n",
"\n",
"headers = {\n",
"    'selector': 'th',\n",
"    'props': 'background-color: white; color: black; width: 75px; font-weight:bold;font-size:11px'\n",
"}\n",
"index_names = {\n",
"    'selector': '.index_name',\n",
"    'props': 'font-style: normal; color: black; font-weight:bold; width:300px;font-size:12px'}\n",
"cell_hover = {  # for row hover use <tr> instead of <td>\n",
"    'selector': 'td:hover',\n",
"    'props': [('background-color', '#ffffb3'),(\"font-size\", \"130%\"),(\"font-weight\",\"bold\")]\n",
"}\n",
"submission = {\n",
"    'selector': 'td',\n",
"    'props': 'font-style: italic; color: black; font-weight:normal;'}\n",
"caption = dict(selector=\"caption\", props=[(\"caption-side\", \"bottom\")])\n",
"\n",
"\n",
"styled = (\n",
"    df.style\n",
"    .set_table_attributes('style=\"font-size: 14px\"')\n",
"    .set_table_styles([submission,caption,cell_hover,index_names,headers], overwrite=False)\n",
"    .format(precision=1)\n",
"    .set_caption(\"*WER(p): WER for slue-voxpopuli, WER(c): WER for slue-voxceleb\")\n",
")\n",
"HTML(styled.to_html())"
] } ], "metadata": { "celltoolbar": "Tags", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" } }, "nbformat": 4, "nbformat_minor": 5 }