Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Huggingface Integration #916

Open
wants to merge 45 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
56dacad
added hf cli
pranayasinghcsmpl Aug 14, 2024
32a206e
updated setup.py
pranayasinghcsmpl Aug 14, 2024
f8c3e6a
Merge branch 'master' into hf_cli4
sarthakpati Aug 19, 2024
4651320
Merge branch 'master' into hf_cli4
sarthakpati Aug 20, 2024
5a8a7f1
added hf cli tests & documentation
Sep 5, 2024
350906f
Merge branch 'master' into hf_cli4
sarthakpati Sep 5, 2024
26bdd11
added colorlog
Sep 6, 2024
178b5ab
Merge branch 'hf_cli4' of https://github.com/pranayasinghcsmpl/GaNDLF…
Sep 6, 2024
0e6a297
added colorlog
Sep 6, 2024
97585b8
added colorlog
Sep 6, 2024
99d91bd
Merge branch 'master' into hf_cli4
sarthakpati Sep 9, 2024
3fdf152
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
9c414d9
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
2c70b01
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
57f22fc
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
8ac070a
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
c4fc455
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
be4a3c9
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Sep 10, 2024
0d6b998
Merge branch 'master' into hf_cli4
sarthakpati Sep 11, 2024
5511c17
Merge branch 'master' into hf_cli4
sarthakpati Sep 11, 2024
4c3804f
Merge branch 'master' into hf_cli4
sarthakpati Sep 12, 2024
f51d7a4
hf-template-added
Sep 16, 2024
8a7ad4c
hf-template
Sep 16, 2024
acd6bdf
hf-template
Sep 16, 2024
38984d0
resolved conflit
Sep 16, 2024
912b7f8
resolved
Sep 18, 2024
a7f6335
resolved_2issue
Sep 18, 2024
06564b5
resolved-lint
Sep 19, 2024
7ac6b94
Update testing/test_full.py
sarthakpati Sep 19, 2024
a955473
huggingface_test updated
Sep 24, 2024
1a9d6e9
huggingface_test updated
Sep 24, 2024
5e8a97b
Merge branch 'hf_cli4' of https://github.com/pranayasinghcsmpl/GaNDLF…
Sep 24, 2024
a0c7e2d
change coding style
Sep 24, 2024
5e9374b
Merge branch 'master' into hf_cli4
sarthakpati Sep 24, 2024
8c788ee
Merge branch 'master' into hf_cli4
sarthakpati Sep 24, 2024
202230a
Merge branch 'master' into hf_cli4
sarthakpati Sep 30, 2024
d82e473
Merge branch 'master' into hf_cli4
sarthakpati Oct 1, 2024
3cadbfd
Merge branch 'master' into hf_cli4
sarthakpati Oct 1, 2024
04e049c
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Oct 1, 2024
be3c37f
Update GANDLF/cli/huggingface_hub_handler.py
sarthakpati Oct 1, 2024
805bc06
Update GANDLF/entrypoints/hf_hub_integration.py
sarthakpati Oct 1, 2024
8cd0d7d
Update setup.py
sarthakpati Oct 1, 2024
b925855
added default template for the Huggingface deployment
Oct 8, 2024
d0e412f
Added Default Template For the Huggingface
Oct 8, 2024
cb9c4c1
Merge branch 'hf_cli4' of https://github.com/pranayasinghcsmpl/GaNDLF…
Oct 8, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions GANDLF/cli/huggingface_hub_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from huggingface_hub import HfApi, snapshot_download, ModelCardData, ModelCard
from typing import List, Union
from GANDLF import version
from pathlib import Path
from GANDLF.utils import get_git_hash
import re


def validate_model_card(file_path: str):
    """
    Validate that the required fields in the model card are not null, empty, or set to 'REQUIRED_FOR_GANDLF'.
    The fields must contain valid alphabetic or alphanumeric values.

    Each required field is expected to appear in the template as
    ``**<Label>:** {{ <variable> | default("[<value>]", true) }}``; the text
    between the square brackets is the value that gets validated.

    Args:
        file_path (str): The path to the Markdown file to validate.

    Returns:
        str: The unmodified contents of the template file.

    Raises:
        AssertionError: If any required field is missing, empty, null, or contains 'REQUIRED_FOR_GANDLF'.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    template_str = Path(file_path).read_text(encoding="utf-8")

    # Human-readable label in the card -> template variable that carries it.
    required_fields = {
        "Developed by": "developers",
        "License": "license",
        "Primary Organization": "primary_organization",
        "Commercial use policy": "commercial_use",
    }

    for field, variable in required_fields.items():
        pattern = re.compile(
            r"\*\*"
            + re.escape(field)
            + r":\*\*\s*\{\{\s*"
            + variable
            + r'\s*\|\s*default\("(.+?)",\s*true\)\s*\}\}',
            re.MULTILINE,
        )
        match = pattern.search(template_str)

        # Explicit raises (rather than `assert`) keep the validation active
        # even when Python runs with -O, while preserving the exception type.
        if match is None:
            raise AssertionError(
                f"Field '{field}' is missing or not found in the file."
            )

        # The value is whatever sits inside the first [...] of the default.
        bracketed = re.search(r"\[([^\]]+)\]", match.group(1))
        value = bracketed.group(1) if bracketed else None

        if value == "REQUIRED_FOR_GANDLF":
            raise AssertionError(
                f"The value for '{field}' is set to the default placeholder '[REQUIRED_FOR_GANDLF]'. It must be a valid value."
            )
        if not value:
            raise AssertionError(f"The value for '{field}' is empty or null.")

        if not re.match(r"^[a-zA-Z0-9]+$", value):
            raise AssertionError(
                f"The value for '{field}' must be alphabetic or alphanumeric, but got: '{value}'"
            )

    print(
        "All required fields are valid, non-empty, properly filled, and do not contain '[REQUIRED_FOR_GANDLF]'."
    )

    return template_str


def push_to_model_hub(
    repo_id: str,
    folder_path: str,
    hf_template: str,
    path_in_repo: Union[str, None] = None,
    commit_message: Union[str, None] = None,
    commit_description: Union[str, None] = None,
    token: Union[str, None] = None,
    repo_type: Union[str, None] = None,
    revision: Union[str, None] = None,
    allow_patterns: Union[List[str], str, None] = None,
    ignore_patterns: Union[List[str], str, None] = None,
    delete_patterns: Union[List[str], str, None] = None,
):
    """
    Generate a model card from a validated template and upload a local folder to the Hugging Face Hub.

    Args:
        repo_id (str): Target repository, as "<user or organization>/<repo name>".
        folder_path (str): Local folder to upload; the generated README.md is written into it first.
        hf_template (str): Path to the model card template; it must pass `validate_model_card`.
        path_in_repo (Union[str, None]): Destination directory inside the repo.
        commit_message (Union[str, None]): Summary line of the generated commit.
        commit_description (Union[str, None]): Description of the generated commit.
        token (Union[str, None]): Token used to authenticate the `HfApi` client.
        repo_type (Union[str, None]): Hub repo type; treated as "model" when not given.
        revision (Union[str, None]): Git revision to commit to.
        allow_patterns (Union[List[str], str, None]): Forwarded to `HfApi.upload_folder`.
        ignore_patterns (Union[List[str], str, None]): Forwarded to `HfApi.upload_folder`.
        delete_patterns (Union[List[str], str, None]): Forwarded to `HfApi.upload_folder`.

    Raises:
        AssertionError: If the model card template fails validation.
    """
    # Validate the card before touching the Hub so that an invalid template
    # does not leave a freshly created, empty repository behind.
    readme_template = validate_model_card(hf_template)

    api = HfApi(token=token)

    # Previously the upload was always sent as a "model" repo regardless of
    # what the caller asked for; keep "model" only as the fallback.
    repo_type = repo_type or "model"

    try:
        # exist_ok avoids reporting an error on every re-upload to an existing repo.
        repo_id = api.create_repo(
            repo_id, repo_type=repo_type, exist_ok=True
        ).repo_id
    except Exception as e:
        # Best effort: report the failure and still attempt the upload with
        # the repo_id the caller supplied.
        print(f"Error: {e}")

    tags = ["v" + version]

    git_hash = get_git_hash()

    # get_git_hash() is compared against the string "None" here, as in the
    # original code — presumably its "no hash available" marker; verify against the helper.
    if git_hash != "None":
        tags += [git_hash]

    card_data = ModelCardData(library_name="GaNDLF", tags=tags)
    card = ModelCard.from_template(card_data, template_str=readme_template)

    # The card is saved inside the folder so it is uploaded with the rest of the files.
    card.save(Path(folder_path, "README.md"))

    api.upload_folder(
        repo_id=repo_id,
        folder_path=folder_path,
        path_in_repo=path_in_repo,
        commit_message=commit_message,
        commit_description=commit_description,
        repo_type=repo_type,
        revision=revision,
        allow_patterns=allow_patterns,
        ignore_patterns=ignore_patterns,
        delete_patterns=delete_patterns,
    )
    print("Model Sucessfully Uploded")


def download_from_hub(
    repo_id: str,
    revision: Union[str, None] = None,
    cache_dir: Union[str, None] = None,
    local_dir: Union[str, None] = None,
    force_download: bool = False,
    token: Union[str, None] = None,
):
    """
    Download a repository snapshot from the Hugging Face Hub.

    Thin wrapper around `huggingface_hub.snapshot_download`; every argument
    is forwarded unchanged.

    Args:
        repo_id (str): Repository to download, as "<user or organization>/<repo name>".
        revision (Union[str, None]): Git revision (branch name, tag, or commit hash).
        cache_dir (Union[str, None]): Folder where cached files are stored.
        local_dir (Union[str, None]): If provided, files are placed under this directory.
        force_download (bool): Download even if the files already exist in the local cache.
        token (Union[str, None]): Token to be used for the download.

    Returns:
        The value returned by `snapshot_download` (the local path of the
        downloaded snapshot), so callers can locate the files.
    """
    # Return the download location instead of discarding it.
    return snapshot_download(
        repo_id=repo_id,
        revision=revision,
        cache_dir=cache_dir,
        local_dir=local_dir,
        force_download=force_download,
        token=token,
    )
Comment on lines +127 to +142
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure this alias is really needed. I would simply call snapshot_download in other places in the code.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think the alias is not needed and that snapshot_download could be used by default

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Wauplin Actually, this alias is used to align the Hugging Face download feature with the GaNDLF command-line design pattern; switching to calling snapshot_download directly may abruptly conflict with the command-line arguments.

125 changes: 125 additions & 0 deletions GANDLF/entrypoints/hf_hub_integration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import click
from GANDLF.entrypoints import append_copyright_to_help
from GANDLF.cli.huggingface_hub_handler import push_to_model_hub, download_from_hub
from pathlib import Path

# Directory one level above the folder containing this file, kept as a string
# because the name was a string in the original module.
huggingfaceDir = str(Path(__file__).absolute().parent.parent)

# Huggingface template Path for Model deployment
# Joined with pathlib: the previous "\hugging_face.md" literal relied on an
# invalid escape sequence and a Windows-only separator, so the default
# template path did not exist on POSIX systems.
huggingface_file_path = str(Path(huggingfaceDir) / "hugging_face.md")


@click.command()
@click.option(
    "--upload/--download",
    "-u/-d",
    required=True,
    help="Upload or download to/from a Huggingface Repo",
)
@click.option(
    "--repo-id",
    "-rid",
    required=True,
    help="Downloading/Uploading: A user or an organization name and a repo name separated by a /",
)
@click.option(
    "--token",
    "-tk",
    help="Downloading/Uploading: A token to be used for the download/upload",
)
@click.option(
    "--revision",
    "-rv",
    help="Downloading/Uploading: git revision id which can be a branch name, a tag, or a commit hash",
)
@click.option(
    "--cache-dir",
    "-cdir",
    help="Downloading: path to the folder where cached files are stored",
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
@click.option(
    "--local-dir",
    "-ldir",
    help="Downloading: if provided, the downloaded file will be placed under this directory",
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
@click.option(
    "--force-download",
    "-fd",
    is_flag=True,
    help="Downloading: Whether the file should be downloaded even if it already exists in the local cache",
)
@click.option(
    "--folder-path",
    "-fp",
    help="Uploading: Path to the folder to upload on the local file system",
    type=click.Path(exists=True, file_okay=False, dir_okay=True),
)
@click.option(
    "--path-in-repo",
    "-pir",
    help="Uploading: Relative path of the directory in the repo. Will default to the root folder of the repository",
)
@click.option(
    "--commit-message",
    "-cr",
    help='Uploading: The summary / title / first line of the generated commit. Defaults to: f"Upload {path_in_repo} with huggingface_hub"',
)
@click.option(
    "--commit-description",
    "-cd",
    help="Uploading: The description of the generated commit",
)
@click.option(
    "--repo-type",
    "-rt",
    help='Uploading: Set to "dataset" or "space" if uploading to a dataset or space, "model" if uploading to a model. Default is model',
)
@click.option(
    "--allow-patterns",
    "-ap",
    help="Uploading: If provided, only files matching at least one pattern are uploaded",
)
@click.option(
    "--ignore-patterns",
    "-ip",
    help="Uploading: If provided, files matching any of the patterns are not uploaded",
)
@click.option(
    "--delete-patterns",
    "-dp",
    help="Uploading: If provided, remote files matching any of the patterns are deleted from the repo while committing new files",
)
@click.option(
    "--hf-template",
    "-hft",
    help="Adding the template path for the model card it is Required during Uploaing a model",
    default=huggingface_file_path,
    type=click.Path(exists=True, file_okay=True, dir_okay=False),
)
@append_copyright_to_help
def new_way(
    upload: bool,
    repo_id: str,
    token: str,
    hf_template: str,
    revision: str,
    cache_dir: str,
    local_dir: str,
    force_download: bool,
    folder_path: str,
    path_in_repo: str,
    commit_message: str,
    commit_description: str,
    repo_type: str,
    allow_patterns: str,
    ignore_patterns: str,
    delete_patterns: str,
):
    """Upload a folder to, or download a repository from, the Hugging Face Hub."""
    # Every callback parameter needs a matching click option: the three
    # *_patterns options were previously missing, so click could not supply
    # them and the command failed before reaching this body.
    if upload:
        # --folder-path is optional at the click level because it does not
        # apply to downloads; without it the upload cannot build the
        # README.md path, so reject the call with a clear usage error.
        if folder_path is None:
            raise click.UsageError(
                "--folder-path is required when uploading to the Huggingface Hub."
            )
        # Keyword arguments keep this call correct even if the handler's
        # parameter order changes.
        push_to_model_hub(
            repo_id=repo_id,
            folder_path=folder_path,
            hf_template=hf_template,
            path_in_repo=path_in_repo,
            commit_message=commit_message,
            commit_description=commit_description,
            token=token,
            repo_type=repo_type,
            revision=revision,
            allow_patterns=allow_patterns,
            ignore_patterns=ignore_patterns,
            delete_patterns=delete_patterns,
        )
    else:
        download_from_hub(
            repo_id=repo_id,
            revision=revision,
            cache_dir=cache_dir,
            local_dir=local_dir,
            force_download=force_download,
            token=token,
        )
2 changes: 2 additions & 0 deletions GANDLF/entrypoints/subcommands.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from GANDLF.entrypoints.generate_metrics import new_way as generate_metrics_command
from GANDLF.entrypoints.debug_info import new_way as debug_info_command
from GANDLF.entrypoints.split_csv import new_way as split_csv_command
from GANDLF.entrypoints.hf_hub_integration import new_way as hf_command


cli_subcommands = {
Expand All @@ -29,4 +30,5 @@
"generate-metrics": generate_metrics_command,
"debug-info": debug_info_command,
"split-csv": split_csv_command,
"hf": hf_command,
}
Loading
Loading