Commit e9177b9
Parent(s): 7bdbf7b

Fix bugs, fix datasets path, added test functions

- functions.py +24 -19
- openllm.py +7 -1
functions.py CHANGED

@@ -7,6 +7,7 @@ from pytablewriter import MarkdownTableWriter
 import gradio as gr
 from openllm import get_json_format_data, get_datas
 import pandas as pd
+import traceback
 
 BOT_HF_TOKEN = os.getenv('BOT_HF_TOKEN')
 
@@ -23,7 +24,7 @@ If you encounter any issues, please report them to https://huggingface.co/spaces
 """
 
 def search(df, value):
-    result_df = df[df["Model"] == value]
+    result_df = df[df["Model Name"] == value]
     return result_df.iloc[0].to_dict() if not result_df.empty else None
 
 
@@ -39,8 +40,8 @@ def get_query_url(repo):
 def get_task_summary(results):
   return {
       "ENEM":
-          {"dataset_type":"enem_challenge",
-          "dataset_name":"ENEM Challenge",
+          {"dataset_type":"eduagarcia/enem_challenge",
+          "dataset_name":"ENEM Challenge (No Images)",
           "metric_type":"acc",
           "metric_value":results["ENEM"],
           "dataset_config": None,
@@ -50,8 +51,8 @@ def get_task_summary(results):
           "metric_name":"accuracy"
           },
       "BLUEX":
-          {"dataset_type":"…
-          "dataset_name":"BLUEX",
+          {"dataset_type":"eduagarcia-temp/BLUEX_without_images",
+          "dataset_name":"BLUEX (No Images)",
           "metric_type":"acc",
           "metric_value":results["BLUEX"],
           "dataset_config": None,
@@ -61,7 +62,7 @@ def get_task_summary(results):
           "metric_name":"accuracy"
           },
       "OAB Exams":
-          {"dataset_type":"oab_exams",
+          {"dataset_type":"eduagarcia/oab_exams",
           "dataset_name":"OAB Exams",
           "metric_type":"acc",
           "metric_value":results["OAB Exams"],
@@ -72,8 +73,8 @@ def get_task_summary(results):
           "metric_name":"accuracy"
           },
       "ASSIN2 RTE":
-          {"dataset_type":"…
-          "dataset_name":"…
+          {"dataset_type":"assin2",
+          "dataset_name":"Assin2 RTE",
           "metric_type":"f1_macro",
           "metric_value":results["ASSIN2 RTE"],
           "dataset_config": None,
@@ -83,8 +84,8 @@ def get_task_summary(results):
           "metric_name":"f1-macro"
           },
       "ASSIN2 STS":
-          {"dataset_type":"…
-          "dataset_name":"…
+          {"dataset_type":"assin2",
+          "dataset_name":"Assin2 STS",
           "metric_type":"pearson",
           "metric_value":results["ASSIN2 STS"],
           "dataset_config": None,
@@ -94,8 +95,8 @@ def get_task_summary(results):
           "metric_name":"pearson"
           },
       "FAQUAD NLI":
-          {"dataset_type":"…
-          "dataset_name":"…
+          {"dataset_type":"ruanchaves/faquad-nli",
+          "dataset_name":"FaQuAD NLI",
           "metric_type":"f1_macro",
           "metric_value":results["FAQUAD NLI"],
           "dataset_config": None,
@@ -105,8 +106,8 @@ def get_task_summary(results):
           "metric_name":"f1-macro"
           },
       "HateBR":
-          {"dataset_type":"…
-          "dataset_name":"HateBR",
+          {"dataset_type":"eduagarcia/portuguese_benchmark",
+          "dataset_name":"HateBR Binary",
           "metric_type":"f1_macro",
           "metric_value":results["HateBR"],
           "dataset_config": None,
@@ -116,8 +117,8 @@ def get_task_summary(results):
           "metric_name":"f1-macro"
           },
       "PT Hate Speech":
-          {"dataset_type":"…
-          "dataset_name":"PT Hate Speech",
+          {"dataset_type":"eduagarcia/portuguese_benchmark",
+          "dataset_name":"PT Hate Speech Binary",
           "metric_type":"f1_macro",
           "metric_value":results["PT Hate Speech"],
           "dataset_config": None,
@@ -127,7 +128,7 @@ def get_task_summary(results):
           "metric_name":"f1-macro"
           },
       "tweetSentBR":
-          {"dataset_type":"tweetsentbr",
+          {"dataset_type":"eduagarcia-temp/tweetsentbr",
           "dataset_name":"tweetSentBR",
           "metric_type":"f1_macro",
           "metric_value":results["tweetSentBR"],
@@ -146,7 +147,7 @@ def get_eval_results(repo):
   task_summary = get_task_summary(results)
   md_writer = MarkdownTableWriter()
   md_writer.headers = ["Metric", "Value"]
-  md_writer.value_matrix = [["…
+  md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"]] + [[v["dataset_name"], v["metric_value"]] for v in task_summary.values()]
 
   text = f"""
 # [Open Portuguese LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/eduagarcia/open_pt_llm_leaderboard)
@@ -201,6 +202,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
       if "Repo card metadata block was not found." in str(e): # There is no readme
         readme_text = get_edited_yaml_readme(repo, token=token)
       else:
+        traceback.print_exc()
         print(f"Something went wrong: {e}")
 
     liste = [CommitOperationAdd(path_in_repo="README.md", path_or_fileobj=readme_text.encode())]
@@ -217,4 +219,7 @@ def commit(repo, pr_number=None, message="Adding Evaluation Results", oauth_toke
     elif "Repository Not Found" in str(e):
       return "Repository Not Found"
     else:
-      return e
+      return e
+
+if __name__ == "__main__":
+  print(get_eval_results("Qwen/Qwen1.5-72B-Chat"))
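The rewritten value_matrix line builds the README metrics table from the get_task_summary output: the bolded average first, then one row per task. A minimal standalone sketch of that pytablewriter pattern (the scores and the two-task summary below are made up for illustration):

    from pytablewriter import MarkdownTableWriter

    # Hypothetical scores standing in for the leaderboard's `results` dict.
    results = {"Average ⬆️": 70.5, "ENEM": 72.3, "BLUEX": 65.1}
    task_summary = {
        "ENEM": {"dataset_name": "ENEM Challenge (No Images)", "metric_value": results["ENEM"]},
        "BLUEX": {"dataset_name": "BLUEX (No Images)", "metric_value": results["BLUEX"]},
    }

    md_writer = MarkdownTableWriter()
    md_writer.headers = ["Metric", "Value"]
    # Average row first (bolded), then one row per task, as in the commit.
    md_writer.value_matrix = [["Average", f"**{results['Average ⬆️']}**"]] + [
        [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
    ]
    print(md_writer.dumps())  # renders the table as a Markdown string

MarkdownTableWriter turns headers plus value_matrix rows into a Markdown table, which is what gets embedded in the generated model card text.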
    	
openllm.py CHANGED

@@ -41,4 +41,10 @@ def get_datas(data):
         except (KeyError, TypeError):
             continue
 
-    return result_list
+    return result_list
+
+if __name__ == "__main__":
+    data = get_json_format_data()
+    print(data)
+    finished_models = get_datas(data)
+    print(finished_models)
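The restored return result_list plus the new __main__ block give openllm.py a command-line smoke test. A hedged sketch of how its output would feed the search helper in functions.py, assuming get_datas returns a list of per-model dicts that includes the renamed "Model Name" column:

    import pandas as pd
    from openllm import get_json_format_data, get_datas

    # Fetch and flatten the leaderboard data (assumed shape: one dict per
    # finished model, keyed by column names such as "Model Name").
    data = get_json_format_data()
    finished_models = get_datas(data)
    df = pd.DataFrame(finished_models)

    # search() in functions.py now matches on "Model Name" instead of "Model".
    result_df = df[df["Model Name"] == "Qwen/Qwen1.5-72B-Chat"]
    print(result_df.iloc[0].to_dict() if not result_df.empty else None)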
 
			
