From 70143fb42b76b7485986fbf5cb03dfbf59b9c7f0 Mon Sep 17 00:00:00 2001 From: William Pietri Date: Tue, 1 Oct 2024 18:52:55 -0500 Subject: [PATCH] Fix the smoke test (except the benchmark runs) (#547) * Try to get it working with the huggingface stuff. * Disabling the SAFE sxc test because it has no valid data yet. * Fix the broken nightly tests by increasing timeouts and declaring some models as ok to fail. * Fix a project merge issue. * Fix another project merge issue. * Remove nous from 0.5 SUT set. * Don't show the extremely large debugging output. * Main is broken; disabling the smoke test for now. --- .github/workflows/scheduled-smoke-test.yml | 35 ++++++++---------- .../modelgauge/suts/huggingface_inference.py | 4 +- .../validation_tests/test_object_creation.py | 8 +++- src/modelbench/suts.py | 1 - src/modelgauge/instance_factory.py | 5 +++ src/modelgauge/tests/safe_v1.py | 2 +- .../modelgauge_tests/data/sample_cache.sqlite | Bin 12288 -> 12288 bytes .../modelgauge_tests/test_instance_factory.py | 8 ++++ 8 files changed, 38 insertions(+), 25 deletions(-) diff --git a/.github/workflows/scheduled-smoke-test.yml b/.github/workflows/scheduled-smoke-test.yml index 4d8da9df..31776111 100644 --- a/.github/workflows/scheduled-smoke-test.yml +++ b/.github/workflows/scheduled-smoke-test.yml @@ -65,6 +65,9 @@ jobs: [openai] api_key = "${{ secrets.OPENAI_API_KEY }}" + + [hugging_face] + token = "${{ secrets.HUGGING_FACE_TOKEN }}" [demo] api_key="12345" @@ -76,35 +79,29 @@ jobs: run: | source .venv/bin/activate pytest --expensive-tests + +# TODO Disabled pending Modelbench#509 +# - name: Test standard run +# run: | +# source .venv/bin/activate +# modelbench benchmark --debug -m 1 +# +# - name: Test v1 run +# run: | +# source .venv/bin/activate +# modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1 + - name: Ensure the artifact published on Pypi still works as expected run: | rm -rf .venv mkdir -p ../installation/config - cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml + cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml cd ../installation touch ./config/secrets.toml poetry lock poetry install --no-root poetry run modelgauge list-tests - - name: Test standard run - run: | - source .venv/bin/activate - modelbench benchmark --debug -m 1 - - - uses: JasonEtco/create-an-issue@v2 - if: failure() - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - RUN_ID: ${{ github.run_id }} - with: - filename: .github/failed-scheduled-issue.md - - - name: Test v1 run - run: | - source .venv/bin/activate - modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1 - - uses: JasonEtco/create-an-issue@v2 if: failure() env: diff --git a/plugins/huggingface/modelgauge/suts/huggingface_inference.py b/plugins/huggingface/modelgauge/suts/huggingface_inference.py index 139b3ab0..6b71886d 100644 --- a/plugins/huggingface/modelgauge/suts/huggingface_inference.py +++ b/plugins/huggingface/modelgauge/suts/huggingface_inference.py @@ -43,7 +43,7 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc def _create_client(self): endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value) - timeout = 60 * 6 + timeout = 60 * 10 if endpoint.status in [ InferenceEndpointStatus.PENDING, InferenceEndpointStatus.INITIALIZING, @@ -61,7 +61,7 @@ def _create_client(self): endpoint.wait(timeout) elif endpoint.status != InferenceEndpointStatus.RUNNING: raise ConnectionError( - "Endpoint is not running: Please contact admin to ensure endpoint is starting or running" + f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})" ) self.client = InferenceClient(base_url=endpoint.url, token=self.token.value) diff --git a/plugins/validation_tests/test_object_creation.py b/plugins/validation_tests/test_object_creation.py index ae79874b..269e83c9 100644 --- a/plugins/validation_tests/test_object_creation.py +++ b/plugins/validation_tests/test_object_creation.py @@ -66,15 +66,19 @@ def test_all_suts_construct_and_record_init(sut_name): assert isinstance(sut.initialization_record, InitializationRecord) +SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING = {"StripedHyena-Nous-7B"} + + # This test can take a while, and we don't want a test run to fail # just because an external service is being slow. So we set a somewhat # high timeout value that gives the test a chance to complete most of the time, # but still fails if the external service really is flaky or slow, so we can # get a sense of a real user's experience. @expensive_tests -@pytest.mark.timeout(45) -@pytest.mark.parametrize("sut_name", [key for key, _ in SUTS.items()]) +@pytest.mark.timeout(650) # up to 10 minutes for Hugging Face spinup, plus some time for the test itself +@pytest.mark.parametrize("sut_name", set(SUTS.keys()) - SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING) def test_all_suts_can_evaluate(sut_name): + sut = SUTS.make_instance(sut_name, secrets=load_secrets_from_config()) assert isinstance(sut, PromptResponseSUT), "Update this test to handle other types." if AcceptsTextPrompt in sut.capabilities: diff --git a/src/modelbench/suts.py b/src/modelbench/suts.py index 265070ed..93cbb6a5 100644 --- a/src/modelbench/suts.py +++ b/src/modelbench/suts.py @@ -74,7 +74,6 @@ def _register_required_suts(): "mistral-7b", "mixtral-8x-7b", "qwen-72b", - "stripedhyena-nous-7b", ] ] diff --git a/src/modelgauge/instance_factory.py b/src/modelgauge/instance_factory.py index bfda923a..13b44b90 100644 --- a/src/modelgauge/instance_factory.py +++ b/src/modelgauge/instance_factory.py @@ -91,3 +91,8 @@ def items(self) -> List[Tuple[str, FactoryEntry[_T]]]: """List all items in the registry.""" with self.lock: return list(self._lookup.items()) + + def keys(self) -> List[str]: + """List all keys in the registry.""" + with self.lock: + return list(self._lookup.keys()) diff --git a/src/modelgauge/tests/safe_v1.py b/src/modelgauge/tests/safe_v1.py index 76a1623e..6c374dab 100644 --- a/src/modelgauge/tests/safe_v1.py +++ b/src/modelgauge/tests/safe_v1.py @@ -80,7 +80,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC): "iwp", "hte", "ssh", - "sxc", + # "sxc", TODO removed for lack of content ] annotators: AnnotatorSet diff --git a/tests/modelgauge_tests/data/sample_cache.sqlite b/tests/modelgauge_tests/data/sample_cache.sqlite index 15025f27bd5658a9f835ab49b9b694031b66e8a3..5d3c8833bfd7dbd3487449f069a337111ab3e622 100644 GIT binary patch delta 31 mcmZojXh@il#24{Do#R>qQJqd&W delta 31 mcmZojXh@il#1}8i%m0IciNBYDznA~=W{Do#R>qM@(Eu6 diff --git a/tests/modelgauge_tests/test_instance_factory.py b/tests/modelgauge_tests/test_instance_factory.py index 2d7d3cf3..c99cf3d4 100644 --- a/tests/modelgauge_tests/test_instance_factory.py +++ b/tests/modelgauge_tests/test_instance_factory.py @@ -72,6 +72,14 @@ def test_lists_all_items(): ] +def test_keys(): + factory = InstanceFactory[MockClass]() + factory.register(MockClass, "k1", "v1") + factory.register(MockClass, "k2", "v2") + factory.register(MockClass, "k3", "v3") + assert factory.keys() == ["k1", "k2", "k3"] + + def test_factory_entry_str(): entry = FactoryEntry(MockClass, uid="k1", args=("v1",), kwargs={"arg2": "v2"}) assert str(entry) == "MockClass(uid=k1, args=('v1',), kwargs={'arg2': 'v2'})"