From 70143fb42b76b7485986fbf5cb03dfbf59b9c7f0 Mon Sep 17 00:00:00 2001
From: William Pietri <wpietri+github.com@mlcommons.org>
Date: Tue, 1 Oct 2024 18:52:55 -0500
Subject: [PATCH] Fix the smoke test (except the benchmark runs) (#547)

* Get the smoke test working with the Hugging Face SUTs by adding a Hugging Face token to the CI secrets.
* Disabling the SAFE sxc test because it has no valid data yet.
* Fix the broken nightly tests by increasing timeouts and declaring some models as ok to fail.
* Fix a project merge issue.
* Fix another project merge issue.
* Remove stripedhyena-nous-7b from the 0.5 SUT set.
* Don't show the extremely large debugging output.
* Main is broken; disabling the benchmark-run steps of the smoke test for now (pending Modelbench#509).
---
 .github/workflows/scheduled-smoke-test.yml    |  35 ++++++++----------
 .../modelgauge/suts/huggingface_inference.py  |   4 +-
 .../validation_tests/test_object_creation.py  |   8 +++-
 src/modelbench/suts.py                        |   1 -
 src/modelgauge/instance_factory.py            |   5 +++
 src/modelgauge/tests/safe_v1.py               |   2 +-
 .../modelgauge_tests/data/sample_cache.sqlite | Bin 12288 -> 12288 bytes
 .../modelgauge_tests/test_instance_factory.py |   8 ++++
 8 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/scheduled-smoke-test.yml b/.github/workflows/scheduled-smoke-test.yml
index 4d8da9df..31776111 100644
--- a/.github/workflows/scheduled-smoke-test.yml
+++ b/.github/workflows/scheduled-smoke-test.yml
@@ -65,6 +65,9 @@ jobs:
 
           [openai]
           api_key = "${{ secrets.OPENAI_API_KEY }}"
+          
+          [hugging_face]
+          token = "${{ secrets.HUGGING_FACE_TOKEN }}"
 
           [demo]
           api_key="12345"
@@ -76,35 +79,29 @@ jobs:
       run: |
         source .venv/bin/activate
         pytest --expensive-tests
+
+# TODO: The benchmark-run steps below are disabled pending Modelbench#509; re-enable them once it is resolved.
+#    - name: Test standard run
+#      run: |
+#        source .venv/bin/activate
+#        modelbench benchmark --debug -m 1
+#
+#    - name: Test v1 run
+#      run: |
+#        source .venv/bin/activate
+#        modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
+
     - name: Ensure the artifact published on Pypi still works as expected
       run: |
         rm -rf .venv
         mkdir -p ../installation/config
-        cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
+        cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
         cd ../installation
         touch ./config/secrets.toml
         poetry lock
         poetry install --no-root
         poetry run modelgauge list-tests
 
-    - name: Test standard run
-      run: |
-        source .venv/bin/activate
-        modelbench benchmark --debug -m 1
-
-    - uses: JasonEtco/create-an-issue@v2
-      if: failure()
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        RUN_ID: ${{ github.run_id }}
-      with:
-        filename: .github/failed-scheduled-issue.md
-
-    - name: Test v1 run
-      run: |
-        source .venv/bin/activate
-        modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
-
     - uses: JasonEtco/create-an-issue@v2
       if: failure()
       env:
diff --git a/plugins/huggingface/modelgauge/suts/huggingface_inference.py b/plugins/huggingface/modelgauge/suts/huggingface_inference.py
index 139b3ab0..6b71886d 100644
--- a/plugins/huggingface/modelgauge/suts/huggingface_inference.py
+++ b/plugins/huggingface/modelgauge/suts/huggingface_inference.py
@@ -43,7 +43,7 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
     def _create_client(self):
         endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)
 
-        timeout = 60 * 6
+        timeout = 60 * 10
         if endpoint.status in [
             InferenceEndpointStatus.PENDING,
             InferenceEndpointStatus.INITIALIZING,
@@ -61,7 +61,7 @@ def _create_client(self):
             endpoint.wait(timeout)
         elif endpoint.status != InferenceEndpointStatus.RUNNING:
             raise ConnectionError(
-                "Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
+                f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
             )
 
         self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
diff --git a/plugins/validation_tests/test_object_creation.py b/plugins/validation_tests/test_object_creation.py
index ae79874b..269e83c9 100644
--- a/plugins/validation_tests/test_object_creation.py
+++ b/plugins/validation_tests/test_object_creation.py
@@ -66,15 +66,19 @@ def test_all_suts_construct_and_record_init(sut_name):
     assert isinstance(sut.initialization_record, InitializationRecord)
 
 
+SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING = {"StripedHyena-Nous-7B"}
+
+
 # This test can take a while, and we don't want a test run to fail
 # just because an external service is being slow. So we set a somewhat
 # high timeout value that gives the test a chance to complete most of the time,
 # but still fails if the external service really is flaky or slow, so we can
 # get a sense of a real user's experience.
 @expensive_tests
-@pytest.mark.timeout(45)
-@pytest.mark.parametrize("sut_name", [key for key, _ in SUTS.items()])
+@pytest.mark.timeout(650)  # up to 10 minutes for Hugging Face spinup, plus some time for the test itself
+@pytest.mark.parametrize("sut_name", set(SUTS.keys()) - SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING)
 def test_all_suts_can_evaluate(sut_name):
+
     sut = SUTS.make_instance(sut_name, secrets=load_secrets_from_config())
     assert isinstance(sut, PromptResponseSUT), "Update this test to handle other types."
     if AcceptsTextPrompt in sut.capabilities:
diff --git a/src/modelbench/suts.py b/src/modelbench/suts.py
index 265070ed..93cbb6a5 100644
--- a/src/modelbench/suts.py
+++ b/src/modelbench/suts.py
@@ -74,7 +74,6 @@ def _register_required_suts():
         "mistral-7b",
         "mixtral-8x-7b",
         "qwen-72b",
-        "stripedhyena-nous-7b",
     ]
 ]
 
diff --git a/src/modelgauge/instance_factory.py b/src/modelgauge/instance_factory.py
index bfda923a..13b44b90 100644
--- a/src/modelgauge/instance_factory.py
+++ b/src/modelgauge/instance_factory.py
@@ -91,3 +91,8 @@ def items(self) -> List[Tuple[str, FactoryEntry[_T]]]:
         """List all items in the registry."""
         with self.lock:
             return list(self._lookup.items())
+
+    def keys(self) -> List[str]:
+        """Return a new list of every key in the registry, read while holding the lock."""
+        with self.lock:
+            return list(self._lookup.keys())
diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py
index 76a1623e..6c374dab 100644
--- a/src/modelgauge/tests/safe_v1.py
+++ b/src/modelgauge/tests/safe_v1.py
@@ -80,7 +80,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
         "iwp",
         "hte",
         "ssh",
-        "sxc",
+        # TODO: "sxc" is disabled because it has no valid data yet; restore it once content exists.
     ]
     annotators: AnnotatorSet
 
diff --git a/tests/modelgauge_tests/data/sample_cache.sqlite b/tests/modelgauge_tests/data/sample_cache.sqlite
index 15025f27bd5658a9f835ab49b9b694031b66e8a3..5d3c8833bfd7dbd3487449f069a337111ab3e622 100644
GIT binary patch
delta 31
mcmZojXh@il#24<x%m0IciNBYDznA~=W<dp6{>{Do#R>qQJqd&W

delta 31
mcmZojXh@il#1}8i%m0IciNBYDznA~=W<dp6{>{Do#R>qM@(Eu6

diff --git a/tests/modelgauge_tests/test_instance_factory.py b/tests/modelgauge_tests/test_instance_factory.py
index 2d7d3cf3..c99cf3d4 100644
--- a/tests/modelgauge_tests/test_instance_factory.py
+++ b/tests/modelgauge_tests/test_instance_factory.py
@@ -72,6 +72,14 @@ def test_lists_all_items():
     ]
 
 
+def test_keys():
+    factory = InstanceFactory[MockClass]()
+    for key, arg in (("k1", "v1"), ("k2", "v2"), ("k3", "v3")):
+        factory.register(MockClass, key, arg)
+    # keys() is expected to list the registered keys in registration order.
+    assert factory.keys() == ["k1", "k2", "k3"]
+
+
 def test_factory_entry_str():
     entry = FactoryEntry(MockClass, uid="k1", args=("v1",), kwargs={"arg2": "v2"})
     assert str(entry) == "MockClass(uid=k1, args=('v1',), kwargs={'arg2': 'v2'})"