Skip to content

Commit 4e63085

Browse files
Yadan-Wei and Yadan Wei
authored
Debug PT2.6 SM Inference Test Failure (#5451)
* run test with prod image * disable autopatch * use ami-04864586635537982 to test * use ami-024d21dba4813c0fb to test * pin sagemaker version * pin sagemaker version * pin sagemaker version * test protobuf * check docker compose * add back docker compose * revert sagemaker version pin in test requirements * update docker version * change docker version * add docker compose v1 to local test * format * add docker-compose before requirements * create a requirements file only for pytorch-inference * revert toml --------- Co-authored-by: Yadan Wei <[email protected]>
1 parent f675a63 commit 4e63085

File tree

3 files changed

+37
-1
lines changed

3 files changed

+37
-1
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
boto3
2+
coverage
3+
# SageMaker PyTorch Inference Toolkit still uses Docker Compose v1
4+
# https://github.com/aws/sagemaker-pytorch-inference-toolkit/blob/v2.0.25/setup.py#L58
5+
# Docker v7.0.0 breaks compatibility with Docker Compose v1 (SageMaker Local)
6+
docker==6.1.3
7+
docker-compose
8+
Flask==1.1.1
9+
fabric
10+
flake8==3.7.7
11+
gitpython
12+
invoke
13+
mock
14+
numpy
15+
Pillow
16+
packaging
17+
# Pin pytest below 8.1 to preserve the caching behavior of earlier versions
18+
pytest<8.1
19+
pytest-cov
20+
pytest-rerunfailures
21+
pytest-xdist
22+
requests<2.32.0
23+
requests_mock
24+
retrying==1.3.3
25+
sagemaker>=2,<3
26+
sagemaker-inference
27+
tenacity
28+
toml
29+
torch<2.5.0
30+
torchvision

test/test_utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ def get_ami_id_ssm(region_name, parameter_path):
115115
region_name="us-west-2",
116116
parameter_path="/aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id",
117117
)
118+
118119
AL2023_BASE_DLAMI_US_EAST_1 = get_ami_id_ssm(
119120
region_name="us-east-1",
120121
parameter_path="/aws/service/deeplearning/ami/x86_64/base-oss-nvidia-driver-gpu-amazon-linux-2023/latest/ami-id",

test/test_utils/sagemaker.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,8 +345,13 @@ def execute_local_tests(image, pytest_cache_params):
345345
"sudo curl -L https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s)-$(uname -m) -o /usr/local/bin/docker-compose"
346346
)
347347
ec2_conn.run("sudo chmod +x /usr/local/bin/docker-compose")
348+
348349
with ec2_conn.cd(path):
349-
ec2_conn.run(f"pip install -r requirements.txt")
350+
if "pytorch-inference" in image:
351+
ec2_conn.run(f"pip install -r pytorch-inference-requirements.txt")
352+
else:
353+
ec2_conn.run(f"pip install -r requirements.txt")
354+
350355
pytest_cache_util.download_pytest_cache_from_s3_to_ec2(
351356
ec2_conn, path, **pytest_cache_params
352357
)

0 commit comments

Comments
 (0)