Skip to content

Commit

Permalink
Fix the smoke test (except the benchmark runs) (#547)
Browse files Browse the repository at this point in the history
* Try to get it working with the huggingface stuff.
* Disabling the SAFE sxc test because it has no valid data yet.
* Fix the broken nightly tests by increasing timeouts and declaring some models as ok to fail.
* Fix a project merge issue.
* Fix another project merge issue.
* Remove nous from 0.5 SUT set.
* Don't show the extremely large debugging output.
* Main is broken; disabling the smoke test for now.
  • Loading branch information
wpietri authored Oct 1, 2024
1 parent e995c38 commit 70143fb
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 25 deletions.
35 changes: 16 additions & 19 deletions .github/workflows/scheduled-smoke-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ jobs:
[openai]
api_key = "${{ secrets.OPENAI_API_KEY }}"
[hugging_face]
token = "${{ secrets.HUGGING_FACE_TOKEN }}"
[demo]
api_key="12345"
Expand All @@ -76,35 +79,29 @@ jobs:
run: |
source .venv/bin/activate
pytest --expensive-tests
# TODO Disabled pending Modelbench#509
# - name: Test standard run
# run: |
# source .venv/bin/activate
# modelbench benchmark --debug -m 1
#
# - name: Test v1 run
# run: |
# source .venv/bin/activate
# modelbench benchmark -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1

- name: Ensure the artifact published on Pypi still works as expected
run: |
rm -rf .venv
mkdir -p ../installation/config
cat ./tests/data/install_pyproject.toml > ../installation/pyproject.toml
cat ./tests/modelgauge_tests/data/install_pyproject.toml > ../installation/pyproject.toml
cd ../installation
touch ./config/secrets.toml
poetry lock
poetry install --no-root
poetry run modelgauge list-tests
- name: Test standard run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
RUN_ID: ${{ github.run_id }}
with:
filename: .github/failed-scheduled-issue.md

- name: Test v1 run
run: |
source .venv/bin/activate
modelbench benchmark --debug -m 1 --benchmark GeneralPurposeAiChatBenchmarkV1
- uses: JasonEtco/create-an-issue@v2
if: failure()
env:
Expand Down
4 changes: 2 additions & 2 deletions plugins/huggingface/modelgauge/suts/huggingface_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def __init__(self, uid: str, inference_endpoint: str, token: HuggingFaceInferenc
def _create_client(self):
endpoint = get_inference_endpoint(self.inference_endpoint, token=self.token.value)

timeout = 60 * 6
timeout = 60 * 10
if endpoint.status in [
InferenceEndpointStatus.PENDING,
InferenceEndpointStatus.INITIALIZING,
Expand All @@ -61,7 +61,7 @@ def _create_client(self):
endpoint.wait(timeout)
elif endpoint.status != InferenceEndpointStatus.RUNNING:
raise ConnectionError(
"Endpoint is not running: Please contact admin to ensure endpoint is starting or running"
f"Endpoint is not running: Please contact admin to ensure endpoint is starting or running (status: {endpoint.status})"
)

self.client = InferenceClient(base_url=endpoint.url, token=self.token.value)
Expand Down
8 changes: 6 additions & 2 deletions plugins/validation_tests/test_object_creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,19 @@ def test_all_suts_construct_and_record_init(sut_name):
assert isinstance(sut.initialization_record, InitializationRecord)


# SUT UIDs excluded from the expensive smoke test below: a failure from one of
# these models should not fail the run (see test_all_suts_can_evaluate, which
# subtracts this set from the parametrized SUT keys).
SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING = {"StripedHyena-Nous-7B"}


# This test can take a while, and we don't want a test run to fail
# just because an external service is being slow. So we set a somewhat
# high timeout value that gives the test a chance to complete most of the time,
# but still fails if the external service really is flaky or slow, so we can
# get a sense of a real user's experience.
@expensive_tests
@pytest.mark.timeout(45)
@pytest.mark.parametrize("sut_name", [key for key, _ in SUTS.items()])
@pytest.mark.timeout(650) # up to 10 minutes for Hugging Face spinup, plus some time for the test itself
@pytest.mark.parametrize("sut_name", set(SUTS.keys()) - SUTS_THAT_WE_DONT_CARE_ABOUT_FAILING)
def test_all_suts_can_evaluate(sut_name):

sut = SUTS.make_instance(sut_name, secrets=load_secrets_from_config())
assert isinstance(sut, PromptResponseSUT), "Update this test to handle other types."
if AcceptsTextPrompt in sut.capabilities:
Expand Down
1 change: 0 additions & 1 deletion src/modelbench/suts.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ def _register_required_suts():
"mistral-7b",
"mixtral-8x-7b",
"qwen-72b",
"stripedhyena-nous-7b",
]
]

Expand Down
5 changes: 5 additions & 0 deletions src/modelgauge/instance_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,8 @@ def items(self) -> List[Tuple[str, FactoryEntry[_T]]]:
"""List all items in the registry."""
with self.lock:
return list(self._lookup.items())

def keys(self) -> List[str]:
    """Return the uids of every entry registered in this factory.

    Acquires the registry lock so the snapshot is consistent even when
    other threads are registering entries concurrently.
    """
    with self.lock:
        return [uid for uid in self._lookup]
2 changes: 1 addition & 1 deletion src/modelgauge/tests/safe_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class BaseSafeTestVersion1(PromptResponseTest, ABC):
"iwp",
"hte",
"ssh",
"sxc",
# "sxc", TODO removed for lack of content
]
annotators: AnnotatorSet

Expand Down
Binary file modified tests/modelgauge_tests/data/sample_cache.sqlite
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/modelgauge_tests/test_instance_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ def test_lists_all_items():
]


def test_keys():
    """keys() reports every registered uid, in registration order."""
    registry = InstanceFactory[MockClass]()
    for uid, value in (("k1", "v1"), ("k2", "v2"), ("k3", "v3")):
        registry.register(MockClass, uid, value)
    assert registry.keys() == ["k1", "k2", "k3"]


def test_factory_entry_str():
entry = FactoryEntry(MockClass, uid="k1", args=("v1",), kwargs={"arg2": "v2"})
assert str(entry) == "MockClass(uid=k1, args=('v1',), kwargs={'arg2': 'v2'})"
Expand Down

0 comments on commit 70143fb

Please sign in to comment.