Skip to content

Commit

Permalink
Fix the smoke test (except the benchmark runs) (#547)
Browse files Browse the repository at this point in the history
* Try to get it working with the huggingface stuff.
* Disabling the SAFE sxc test because it has no valid data yet.
* Fix the broken nightly tests by increasing timeouts and declaring some models as ok to fail.
* Fix a project merge issue.
* Fix another project merge issue.
* Remove nous from 0.5 SUT set.
* Don't show the extremely large debugging output.
* Main is broken; disabling the smoke test for now.
  • Loading branch information
wpietri authored Oct 1, 2024
1 parent e995c38 commit 70143fb
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 25 deletions.
35 changes: 16 additions & 19 deletions .github/workflows/scheduled-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ jobs:
[openai]
api_key = "${{ secrets.OPENAI_API_KEY }}"
[hugging_face]
token = "${{ secrets.HUGGING_FACE_TOKEN }}"
[demo]
api_key="12345"
Expand All @@ -76,35 +79,29 @@ jobs:
run: |
source .venv/bin/activate
pytest --expensive-tests
# TODO Disabled pending Modelbench#509
# - name: Test standard run
# run: |
# source .venv/bin/activate
# modelbench benchmark --debug -m 1
#
# - name: Test v1 run
# run: |
# source .venv/bin/activate
# modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1

- name: Ensure the artifact published on Pypi still works as expected
run: |
rm -rf .venv
mkdir -p ../installation/config
cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
cd ../installation
touch ./config/secrets.toml
poetry lock
poetry install --no-root
poetry run modelgauge list-tests
- name: Test standard run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUN_ID: ${{ github.run_id }}
with:
filename: .github/failed-scheduled-issue.md

- name: Test v1 run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
Expand Down
4 changes: 2 additions & 2 deletions plugins/huggingface/modelgauge/suts/huggingface_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
def _create_client(self):
endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)

timeout = 60 * 6
timeout = 60 * 10
if endpoint.status in [
InferenceEndpointStatus.PENDING,
InferenceEndpointStatus.INITIALIZING,
Expand All @@ -61,7 +61,7 @@ def _create_client(self):
endpoint.wait(timeout)
elif endpoint.status != InferenceEndpointStatus.RUNNING:
raise ConnectionError(
"Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
)

self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
Expand Down
8 changes: 6 additions & 2 deletions plugins/validation_tests/test_object_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,19 @@ def test_all_suts_construct_and_record_init(sut_name):
assert isinstance(sut.initialization_record, InitializationRecord)


# SUT UIDs excluded from the expensive smoke test below: a failure from one of
# these models should not fail the run (see test_all_suts_can_evaluate, which
# subtracts this set from the parametrized SUT keys).
SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING = {"StripedHyena-Nous-7B"}


# This test can take a while, and we don't want a test run to fail
# just because an external service is being slow. So we set a somewhat
# high timeout value that gives the test a chance to complete most of the time,
# but still fails if the external service really is flaky or slow, so we can
# get a sense of a real user's experience.
@expensive_tests
@pytest.mark.timeout(45)
@pytest.mark.parametrize("sut_name", [key for key, _ in SUTS.items()])
@pytest.mark.timeout(650) # up to 10 minutes for Hugging Face spinup, plus some time for the test itself
@pytest.mark.parametrize("sut_name", set(SUTS.keys()) - SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING)
def test_all_suts_can_evaluate(sut_name):

sut = SUTS.make_instance(sut_name, secrets=load_secrets_from_config())
assert isinstance(sut, PromptResponseSUT), "Update this test to handle other types."
if AcceptsTextPrompt in sut.capabilities:
Expand Down
1 change: 0 additions & 1 deletion src/modelbench/suts.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def _register_required_suts():
"mistral-7b",
"mixtral-8x-7b",
"qwen-72b",
"stripedhyena-nous-7b",
]
]

Expand Down
5 changes: 5 additions & 0 deletions src/modelgauge/instance_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,8 @@ def items(self) -> List[Tuple[str, FactoryEntry[_T]]]:
"""List all items in the registry."""
with self.lock:
return list(self._lookup.items())

def keys(self) -> List[str]:
    """Return the uids of every entry registered in this factory.

    Acquires the registry lock so the snapshot is consistent even when
    other threads are registering entries concurrently.
    """
    with self.lock:
        return [uid for uid in self._lookup]
2 changes: 1 addition & 1 deletion src/modelgauge/tests/safe_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
"iwp",
"hte",
"ssh",
"sxc",
# "sxc", TODO removed for lack of content
]
annotators: AnnotatorSet

Expand Down
Binary file modified tests/modelgauge_tests/data/sample_cache.sqlite
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/modelgauge_tests/test_instance_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ def test_lists_all_items():
]


def test_keys():
    """keys() reports every registered uid, in registration order."""
    registry = InstanceFactory[MockClass]()
    for uid, value in (("k1", "v1"), ("k2", "v2"), ("k3", "v3")):
        registry.register(MockClass, uid, value)
    assert registry.keys() == ["k1", "k2", "k3"]


def test_factory_entry_str():
entry = FactoryEntry(MockClass, uid="k1", args=("v1",), kwargs={"arg2": "v2"})
assert str(entry) == "MockClass(uid=k1, args=('v1',), kwargs={'arg2': 'v2'})"
Expand Down

0 comments on commit 70143fb

Please sign in to comment.