Skip to content

Commit c3398ca

Browse files
committed
[Security] Scoped down the network access to managed EFS and FSx storage
from cluster nodes. In particular, before this change all traffic used to be allowed from cluster nodes to the shared storage. With this change this access is restricted to the strictly required ports. For EFS: * Allow all traffic from within the storage security group itself. * Allow TPC traffic on port 2049 form cluster nodes. For FSx: * Allow all traffic from within the storage security group itself. * Allow TPC traffic on ports 988 and 1018-1023 form cluster nodes.
1 parent 21bef59 commit c3398ca

File tree

8 files changed

+207
-54
lines changed

8 files changed

+207
-54
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ CHANGELOG
88
- Add validator that warns against the downsides of disabling in-place updates on compute and login nodes through DevSettings.
99
- Upgrade jmespath to ~=1.0 (from ~=0.10).
1010
- Upgrade tabulate to <=0.9.0 (from <=0.8.10).
11+
- Scope down storage security group ingress rules to specific ports for all node types.
1112

1213
**BUG FIXES**
1314
- Add validation to block updates that change tag order. Blocking such change prevents update failures.

cli/src/pcluster/constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@
127127

128128
FSX_PORTS = {
129129
# Lustre Security group: https://docs.aws.amazon.com/fsx/latest/LustreGuide/limit-access-security-groups.html
130-
LUSTRE: {"tcp": [988]},
130+
# Among the Lustre ports, only 988 is mandatory to provide basic features.
131+
LUSTRE: {"tcp": [988, (1018, 1023)]},
131132
# OpenZFS Security group: https://docs.aws.amazon.com/fsx/latest/OpenZFSGuide/limit-access-security-groups.html
132133
OPENZFS: {"tcp": [111, 2049, 20001, 20002, 20003], "udp": [111, 2049, 20001, 20002, 20003]},
133134
# Ontap Security group: https://docs.aws.amazon.com/fsx/latest/ONTAPGuide/limit-access-security-groups.html

cli/src/pcluster/templates/cluster_stack.py

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -806,25 +806,33 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
806806
if sg_key in seen_sgs:
807807
continue
808808
seen_sgs.add(sg_key)
809-
# TODO Scope down ingress rules to allow only traffic on the strictly necessary ports.
810-
# Currently scoped down only on Login nodes to limit blast radius.
811-
ingress_protocol = "-1"
812-
ingress_port = ALL_PORTS_RANGE
813-
if sg_type == "Login":
814-
if storage_type == SharedStorageType.EFS:
815-
ingress_protocol = "tcp"
816-
ingress_port = EFS_PORT
817-
elif storage_type == SharedStorageType.FSX:
818-
ingress_protocol = "tcp"
819-
ingress_port = FSX_PORTS[LUSTRE]["tcp"][0]
820-
ingress_rule = self._allow_all_ingress(
821-
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}",
822-
source_security_group_id=sg_ref,
823-
group_id=storage_security_group.ref,
824-
ip_protocol=ingress_protocol,
825-
port=ingress_port,
809+
810+
# For Storage-to-Storage, allow all traffic.
811+
# For Head/Compute/Login nodes, allow only the required ports.
812+
storage_ports = {
813+
SharedStorageType.EFS: ("tcp", [EFS_PORT]),
814+
SharedStorageType.FSX: ("tcp", FSX_PORTS[LUSTRE]["tcp"]),
815+
}
816+
ingress_protocol, ingress_ports = (
817+
("-1", [ALL_PORTS_RANGE])
818+
if sg_type == "Storage"
819+
else storage_ports.get(storage_type, ("-1", [ALL_PORTS_RANGE]))
826820
)
827-
rules.append(ingress_rule)
821+
822+
for rule_id, ingress_port in enumerate(ingress_ports):
823+
ingress_rule = self._allow_all_ingress(
824+
description=f"{storage_cfn_id}SecurityGroup{sg_type}Ingress{sg_ref_id}Rule{rule_id}",
825+
source_security_group_id=sg_ref,
826+
group_id=storage_security_group.ref,
827+
ip_protocol=ingress_protocol,
828+
port=ingress_port,
829+
)
830+
rules.append(ingress_rule)
831+
832+
if sg_type == "Storage":
833+
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
834+
storage_deletion_policy
835+
)
828836

829837
egress_rule = self._allow_all_egress(
830838
description=f"{storage_cfn_id}SecurityGroup{sg_type}Egress{sg_ref_id}",
@@ -834,9 +842,6 @@ def _add_storage_security_group(self, storage_cfn_id, storage):
834842
rules.append(egress_rule)
835843

836844
if sg_type == "Storage":
837-
ingress_rule.cfn_options.deletion_policy = ingress_rule.cfn_options.update_replace_policy = (
838-
storage_deletion_policy
839-
)
840845
egress_rule.cfn_options.deletion_policy = egress_rule.cfn_options.update_replace_policy = (
841846
storage_deletion_policy
842847
)

cli/src/pcluster/validators/cluster_validators.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
DELETE_POLICY,
2727
EFS_PORT,
2828
FSX_PORTS,
29+
LUSTRE,
2930
PCLUSTER_IMAGE_BUILD_STATUS_TAG,
3031
PCLUSTER_NAME_MAX_LENGTH,
3132
PCLUSTER_NAME_MAX_LENGTH_SLURM_ACCOUNTING,
@@ -660,11 +661,14 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
660661
network_interfaces = [ni for ni in network_interface_responses if ni.get("VpcId") == vpc_id]
661662

662663
for protocol, ports in FSX_PORTS[file_storage.file_storage_type].items():
664+
# For Lustre, only validate the first port (988), which is the only mandatory one.
665+
# Other ports (1018-1023) are optional so we do not enforce them.
666+
ports_to_validate = [ports[0]] if file_storage.file_storage_type == LUSTRE else ports
663667
missing_ports = self._get_missing_ports(
664668
security_groups_by_nodes,
665669
subnet_ids,
666670
network_interfaces,
667-
ports,
671+
ports_to_validate,
668672
protocol,
669673
file_storage.file_storage_type,
670674
)
@@ -676,7 +680,7 @@ def _check_file_storage(self, security_groups_by_nodes, file_storages, subnet_id
676680
self._add_failure(
677681
f"The current security group settings on file storage '{file_storage_id}' does not"
678682
" satisfy mounting requirement. The file storage must be associated to a security group"
679-
f" that allows {direction} {protocol.upper()} traffic through ports {ports}. "
683+
f" that allows {direction} {protocol.upper()} traffic through ports {ports_to_validate}. "
680684
f"Missing ports: {missing_ports}",
681685
FailureLevel.ERROR,
682686
)

cli/tests/pcluster/templates/test_cluster_stack.py

Lines changed: 94 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1382,7 +1382,6 @@ def test_storage_security_group_deduplication(mocker, test_datadir):
13821382
13831383
When head node, compute nodes, and login nodes all use the same security group (sg-12345678),
13841384
only one set of ingress/egress rules should be created for that security group, not three separate sets.
1385-
The deduplicated rule uses the first occurrence's settings (Head node), which uses all protocols (-1).
13861385
"""
13871386
mock_aws_api(mocker)
13881387
mock_bucket(mocker)
@@ -1395,23 +1394,102 @@ def test_storage_security_group_deduplication(mocker, test_datadir):
13951394
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
13961395
)
13971396

1398-
# Test both EFS and FSx storage types
1399-
for storage_type in ["EFS", "FSX"]:
1400-
storage_sg_ingress_rules = [
1401-
(name, resource)
1402-
for name, resource in generated_template["Resources"].items()
1403-
if resource["Type"] == "AWS::EC2::SecurityGroupIngress"
1404-
and name.startswith(storage_type)
1405-
and "SecurityGroup" in name
1406-
]
1407-
1408-
# Verify deduplication: only 2 rules instead of 4 (Head, Compute, Login share same SG + Storage SG)
1409-
assert_that(len(storage_sg_ingress_rules)).is_equal_to(2)
1397+
# The EFS storage security group must have 2 ingress rules:
1398+
# * allow traffic from cluster nodes (port 2049)
1399+
# * allow traffic from storage nodes (all traffic)
1400+
efs_sg_ingress_rules = [
1401+
(name, resource)
1402+
for name, resource in generated_template["Resources"].items()
1403+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name
1404+
]
1405+
assert_that(len(efs_sg_ingress_rules)).is_equal_to(2)
1406+
1407+
# The FSx Lustre storage security group must have 3 rules:
1408+
# * allow traffic from cluster nodes (port 2049)
1409+
# * allow traffic from cluster nodes (ports 1018-1023)
1410+
# * allow traffic from storage nodes (all traffic)
1411+
fsx_sg_ingress_rules = [
1412+
(name, resource)
1413+
for name, resource in generated_template["Resources"].items()
1414+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name
1415+
]
1416+
assert_that(len(fsx_sg_ingress_rules)).is_equal_to(3)
14101417

1411-
# Verify each unique source security group has exactly one ingress rule
1418+
# Verify each storage type has the expected unique source security groups
1419+
for _storage_type, rules in [("EFS", efs_sg_ingress_rules), ("FSX", fsx_sg_ingress_rules)]:
14121420
source_sgs = {
14131421
str(rule["Properties"].get("SourceSecurityGroupId"))
1414-
for _, rule in storage_sg_ingress_rules
1422+
for _, rule in rules
14151423
if rule["Properties"].get("SourceSecurityGroupId")
14161424
}
1417-
assert_that(len(storage_sg_ingress_rules)).is_equal_to(len(source_sgs))
1425+
# Should have 2 unique source SGs (shared SG + storage SG)
1426+
assert_that(len(source_sgs)).is_equal_to(2)
1427+
1428+
1429+
def test_storage_security_group_port_restrictions(mocker, test_datadir):
1430+
"""
1431+
Test that storage security group rules use restricted ports for head/compute/login nodes.
1432+
1433+
Security group rules should follow these principles:
1434+
1. Storage-to-Storage: Allow all traffic (protocol -1)
1435+
2. Head/Compute/Login nodes to EFS: Allow only TCP port 2049
1436+
3. Head/Compute/Login nodes to FSx Lustre: Allow only TCP ports 988 and 1018-1023
1437+
"""
1438+
mock_aws_api(mocker)
1439+
mock_bucket(mocker)
1440+
mock_bucket_object_utils(mocker)
1441+
1442+
input_yaml = load_yaml_dict(test_datadir / "config-shared-sg.yaml")
1443+
cluster_config = ClusterSchema(cluster_name="clustername").load(input_yaml)
1444+
1445+
generated_template, _ = CDKTemplateBuilder().build_cluster_template(
1446+
cluster_config=cluster_config, bucket=dummy_cluster_bucket(), stack_name="clustername"
1447+
)
1448+
1449+
# Test EFS storage - should only allow port 2049
1450+
efs_ingress_rules = [
1451+
(name, resource)
1452+
for name, resource in generated_template["Resources"].items()
1453+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("EFS") and "SecurityGroup" in name
1454+
]
1455+
1456+
for name, rule in efs_ingress_rules:
1457+
props = rule["Properties"]
1458+
if "Storage" in name:
1459+
# Storage-to-Storage: all traffic allowed
1460+
assert_that(props["IpProtocol"]).is_equal_to("-1")
1461+
assert_that(props["FromPort"]).is_equal_to(0)
1462+
assert_that(props["ToPort"]).is_equal_to(65535)
1463+
else:
1464+
# Head/Compute/Login to EFS: only TCP port 2049
1465+
assert_that(props["IpProtocol"]).is_equal_to("tcp")
1466+
assert_that(props["FromPort"]).is_equal_to(2049)
1467+
assert_that(props["ToPort"]).is_equal_to(2049)
1468+
1469+
# Test FSx Lustre storage - should only allow ports 988 and 1018-1023
1470+
fsx_ingress_rules = [
1471+
(name, resource)
1472+
for name, resource in generated_template["Resources"].items()
1473+
if resource["Type"] == "AWS::EC2::SecurityGroupIngress" and name.startswith("FSX") and "SecurityGroup" in name
1474+
]
1475+
1476+
# Collect non-storage rules to verify FSx ports
1477+
fsx_node_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" not in name]
1478+
fsx_storage_rules = [(name, rule) for name, rule in fsx_ingress_rules if "Storage" in name]
1479+
1480+
# Verify Storage-to-Storage rule allows all traffic
1481+
for _name, rule in fsx_storage_rules:
1482+
props = rule["Properties"]
1483+
assert_that(props["IpProtocol"]).is_equal_to("-1")
1484+
assert_that(props["FromPort"]).is_equal_to(0)
1485+
assert_that(props["ToPort"]).is_equal_to(65535)
1486+
1487+
# Verify Head/Compute/Login rules use TCP and FSx Lustre ports (988, 1018-1023)
1488+
fsx_ports_found = set()
1489+
for _name, rule in fsx_node_rules:
1490+
props = rule["Properties"]
1491+
assert_that(props["IpProtocol"]).is_equal_to("tcp")
1492+
fsx_ports_found.add((props["FromPort"], props["ToPort"]))
1493+
1494+
# Should have rules for port 988 and port range 1018-1023
1495+
assert_that(fsx_ports_found).contains((988, 988), (1018, 1023))
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
Image:
2+
Os: alinux2
3+
HeadNode:
4+
InstanceType: t3.micro
5+
Ssh:
6+
KeyName: ec2-key-name
7+
Networking:
8+
SubnetId: subnet-12345678
9+
SecurityGroups:
10+
- sg-12345678
11+
Scheduling:
12+
Scheduler: slurm
13+
SlurmQueues:
14+
- Name: queue1
15+
Networking:
16+
SubnetIds:
17+
- subnet-12345678
18+
SecurityGroups:
19+
- sg-12345678
20+
ComputeResources:
21+
- Name: compute_resource1
22+
InstanceType: t3.micro
23+
MinCount: 0
24+
MaxCount: 10
25+
LoginNodes:
26+
Pools:
27+
- Name: login
28+
InstanceType: t3.micro
29+
Count: 1
30+
Networking:
31+
SubnetIds:
32+
- subnet-12345678
33+
SecurityGroups:
34+
- sg-12345678
35+
SharedStorage:
36+
- Name: efs
37+
StorageType: Efs
38+
MountDir: /efs
39+
EfsSettings:
40+
ThroughputMode: bursting
41+
- Name: fsx
42+
StorageType: FsxLustre
43+
MountDir: /fsx
44+
FsxLustreSettings:
45+
StorageCapacity: 1200
46+
DeploymentType: SCRATCH_2

cli/tests/pcluster/templates/test_shared_storage.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -107,9 +107,14 @@ def test_shared_storage_efs(mocker, test_datadir, config_file_name, storage_name
107107
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
108108
)
109109
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, mount_target_sg_name]:
110-
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
111-
ingress_port_range = [2049, 2049] if sg == login_nodes_sg_name else [0, 65535]
112110
rule_deletion_policy = deletion_policy if sg == mount_target_sg_name else None
111+
# Storage-to-Storage allows all traffic, node-to-storage uses scoped-down EFS port 2049
112+
if sg == mount_target_sg_name:
113+
ingress_ip_protocol = "-1"
114+
ingress_port_range = [0, 65535]
115+
else:
116+
ingress_ip_protocol = "tcp"
117+
ingress_port_range = [2049, 2049]
113118
assert_sg_rule(
114119
generated_template,
115120
mount_target_sg_name,
@@ -173,18 +178,31 @@ def test_shared_storage_fsx(mocker, test_datadir, config_file_name, storage_name
173178
else f"{cluster_config.login_nodes.pools[0].name}LoginNodesSecurityGroup"
174179
)
175180
for sg in ["HeadNodeSecurityGroup", "ComputeSecurityGroup", login_nodes_sg_name, file_system_sg_name]:
176-
ingress_ip_protocol = "tcp" if sg == login_nodes_sg_name else "-1"
177-
ingress_port_range = [988, 988] if sg == login_nodes_sg_name else [0, 65535]
178181
rule_deletion_policy = deletion_policy if sg == file_system_sg_name else None
179-
assert_sg_rule(
180-
generated_template,
181-
file_system_sg_name,
182-
rule_type="ingress",
183-
protocol=ingress_ip_protocol,
184-
port_range=ingress_port_range,
185-
target_sg=sg,
186-
deletion_policy=rule_deletion_policy,
187-
)
182+
# Storage-to-Storage allows all traffic, node-to-storage uses scoped-down FSx Lustre ports
183+
if sg == file_system_sg_name:
184+
assert_sg_rule(
185+
generated_template,
186+
file_system_sg_name,
187+
rule_type="ingress",
188+
protocol="-1",
189+
port_range=[0, 65535],
190+
target_sg=sg,
191+
deletion_policy=rule_deletion_policy,
192+
)
193+
else:
194+
ingress_ip_protocol = "tcp"
195+
ingress_port_ranges = [(988, 988), (1018, 1023)]
196+
for ingress_port_range in ingress_port_ranges:
197+
assert_sg_rule(
198+
generated_template,
199+
file_system_sg_name,
200+
rule_type="ingress",
201+
protocol=ingress_ip_protocol,
202+
port_range=ingress_port_range,
203+
target_sg=sg,
204+
deletion_policy=rule_deletion_policy,
205+
)
188206
assert_sg_rule(
189207
generated_template,
190208
file_system_sg_name,

tests/integration-tests/tests/storage/test_fsx_lustre.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ def _create_file_cache(file_cache_path, bucket_name):
521521

522522
def _create_fsx_lustre_volume_ids(num_existing_fsx_lustre, fsx_factory, import_path, export_path):
523523
return fsx_factory(
524-
ports=[988],
524+
ports=[988, 1018, 1019, 1020, 1021, 1022, 1023],
525525
ip_protocols=["tcp"],
526526
num=num_existing_fsx_lustre,
527527
file_system_type="LUSTRE",

0 commit comments

Comments
 (0)