From 2d6580da8c993ecbd3a35b78c4d6ab5b46a382e6 Mon Sep 17 00:00:00 2001
From: JinSoo Jeong
Date: Sun, 5 Jan 2025 14:47:28 +0900
Subject: [PATCH] Refactoring terraform

---
Reviewer notes (text between the "---" separator and the first "diff --git"
line is not part of the commit message and is not applied as a change):

* Purpose: moves `locals` and `provider` blocks into dedicated locals.tf /
  providers.tf files for ai-ml/bionemo, ai-ml/ray/terraform and
  ai-ml/trainium-inferentia.
* ai-ml/ray/terraform: the provider and locals blocks removed from main.tf
  reappear in providers.tf / locals.tf with the same content.
* ai-ml/ray/terraform/variables.tf: the three removed lines are the `default`
  values of `region`, `name` and `eks_cluster_version`; after this patch those
  variables have no default in this file.
* ai-ml/bionemo/providers.tf authenticates the kubernetes, helm and kubectl
  providers with `data.aws_eks_cluster_auth.this.token` and uses
  `apply_retry_count = 10`; the ray and trainium-inferentia providers.tf files
  use an `exec` block running `aws eks get-token` and `apply_retry_count = 5`.
* ai-ml/trainium-inferentia/jupyterhub.tf: the "JupyterHub Single User IRSA"
  banner and the `kubernetes_namespace_v1.jupyterhub` resource are moved from
  above `module "jupyterhub_single_user_irsa"` to below it; the banner now sits
  between the IRSA module and the namespace resource.
* ai-ml/trainium-inferentia/providers.tf is created with mode 100755; the other
  new files are created with mode 100644.
* NOTE(review): for bionemo and trainium-inferentia this patch adds
  `local.name`, `local.region`, `local.azs`, `local.tags` and all provider
  blocks without removing any earlier definition of them - confirm they are not
  still declared in another file of those directories.

 ai-ml/bionemo/eks.tf                    |  7 +++
 ai-ml/bionemo/locals.tf                 | 27 +++++++++
 ai-ml/bionemo/providers.tf              | 32 +++++++++++
 ai-ml/bionemo/vpc.tf                    | 14 -----
 ai-ml/ray/terraform/locals.tf           | 18 ++++++
 ai-ml/ray/terraform/main.tf             | 74 -------------------------
 ai-ml/ray/terraform/providers.tf        | 53 ++++++++++++++++++
 ai-ml/ray/terraform/variables.tf        |  3 -
 ai-ml/trainium-inferentia/eks.tf        | 14 +++++
 ai-ml/trainium-inferentia/jupyterhub.tf | 22 ++++----
 ai-ml/trainium-inferentia/locals.tf     | 33 +++++++++++
 ai-ml/trainium-inferentia/providers.tf  | 48 ++++++++++++++++
 ai-ml/trainium-inferentia/vpc.tf        | 12 ----
 13 files changed, 243 insertions(+), 114 deletions(-)
 create mode 100644 ai-ml/bionemo/locals.tf
 create mode 100644 ai-ml/bionemo/providers.tf
 create mode 100644 ai-ml/ray/terraform/locals.tf
 create mode 100644 ai-ml/ray/terraform/providers.tf
 create mode 100644 ai-ml/trainium-inferentia/locals.tf
 create mode 100755 ai-ml/trainium-inferentia/providers.tf

diff --git a/ai-ml/bionemo/eks.tf b/ai-ml/bionemo/eks.tf
index e45e5a816..c3e8f4193 100644
--- a/ai-ml/bionemo/eks.tf
+++ b/ai-ml/bionemo/eks.tf
@@ -143,3 +143,10 @@ module "eks" {
     }
   }
 }
+
+
+data "aws_availability_zones" "available" {}
+
+data "aws_eks_cluster_auth" "this" {
+  name = module.eks.cluster_name
+}
diff --git a/ai-ml/bionemo/locals.tf b/ai-ml/bionemo/locals.tf
new file mode 100644
index 000000000..cf15947ab
--- /dev/null
+++ b/ai-ml/bionemo/locals.tf
@@ -0,0 +1,27 @@
+#---------------------------------------------------------------
+# Local variables
+#---------------------------------------------------------------
+locals {
+  name   = var.name
+  region = var.region
+
+  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
+  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
+  # Routable Public subnets with NAT Gateway and Internet Gateway
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
+  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
+
+  database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)]
+  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
+  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
+  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
+
+  vpc_cidr = var.vpc_cidr
+  azs      = slice(data.aws_availability_zones.available.names, 0, 2)
+
+  tags = {
+    Blueprint  = local.name
+    GithubRepo = "github.com/awslabs/data-on-eks"
+  }
+}
diff --git a/ai-ml/bionemo/providers.tf b/ai-ml/bionemo/providers.tf
new file mode 100644
index 000000000..cab84e377
--- /dev/null
+++ b/ai-ml/bionemo/providers.tf
@@ -0,0 +1,32 @@
+provider "aws" {
+  region = local.region
+}
+
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  token                  = data.aws_eks_cluster_auth.this.token
+}
+
+# ECR always authenticates with `us-east-1` region
+# Docs -> https://docs.aws.amazon.com/AmazonECR/latest/public/public-registries.html
+provider "aws" {
+  alias  = "ecr"
+  region = "us-east-1"
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+    token                  = data.aws_eks_cluster_auth.this.token
+  }
+}
+
+provider "kubectl" {
+  apply_retry_count      = 10
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  load_config_file       = false
+  token                  = data.aws_eks_cluster_auth.this.token
+}
diff --git a/ai-ml/bionemo/vpc.tf b/ai-ml/bionemo/vpc.tf
index f63ccbe0c..30251a8ed 100644
--- a/ai-ml/bionemo/vpc.tf
+++ b/ai-ml/bionemo/vpc.tf
@@ -1,17 +1,3 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-
-  database_private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k + 5)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
 #---------------------------------------------------------------
 # VPC
 #---------------------------------------------------------------
diff --git a/ai-ml/ray/terraform/locals.tf b/ai-ml/ray/terraform/locals.tf
new file mode 100644
index 000000000..b63541f5b
--- /dev/null
+++ b/ai-ml/ray/terraform/locals.tf
@@ -0,0 +1,18 @@
+#---------------------------------------------------------------
+# Locals
+#---------------------------------------------------------------
+locals {
+  name   = var.name
+  region = var.region
+
+  vpc_cidr           = "10.0.0.0/16"
+  secondary_vpc_cidr = "100.64.0.0/16"
+  azs                = slice(data.aws_availability_zones.available.names, 0, 3)
+
+  cluster_version = var.eks_cluster_version
+
+  tags = {
+    Blueprint  = local.name
+    GithubRepo = "github.com/awslabs/data-on-eks"
+  }
+}
diff --git a/ai-ml/ray/terraform/main.tf b/ai-ml/ray/terraform/main.tf
index d6e9f5b49..191085c9a 100644
--- a/ai-ml/ray/terraform/main.tf
+++ b/ai-ml/ray/terraform/main.tf
@@ -1,57 +1,3 @@
-#---------------------------------------------------------------
-# Providers
-#---------------------------------------------------------------
-
-provider "aws" {
-  region = local.region
-}
-
-# Used for Karpenter Helm chart
-provider "aws" {
-  region = "us-east-1"
-  alias  = "ecr_public_region"
-}
-
-provider "kubernetes" {
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-
-  exec {
-    api_version = "client.authentication.k8s.io/v1beta1"
-    command     = "aws"
-    # This requires the awscli to be installed locally where Terraform is executed
-    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-  }
-}
-
-provider "helm" {
-  kubernetes {
-    host                   = module.eks.cluster_endpoint
-    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-
-    exec {
-      api_version = "client.authentication.k8s.io/v1beta1"
-      command     = "aws"
-      # This requires the awscli to be installed locally where Terraform is executed
-      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-    }
-  }
-}
-
-provider "kubectl" {
-  apply_retry_count      = 5
-  host                   = module.eks.cluster_endpoint
-  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
-  load_config_file       = false
-
-  exec {
-    api_version = "client.authentication.k8s.io/v1beta1"
-    command     = "aws"
-    # This requires the awscli to be installed locally where Terraform is executed
-    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
-  }
-}
-
 #---------------------------------------------------------------
 # Data Sources
 #---------------------------------------------------------------
@@ -63,26 +9,6 @@ data "aws_ecrpublic_authorization_token" "token" {
   provider = aws.ecr_public_region
 }
 
-#---------------------------------------------------------------
-# Locals
-#---------------------------------------------------------------
-
-locals {
-  name   = var.name
-  region = var.region
-
-  vpc_cidr           = "10.0.0.0/16"
-  secondary_vpc_cidr = "100.64.0.0/16"
-  azs                = slice(data.aws_availability_zones.available.names, 0, 3)
-
-  cluster_version = var.eks_cluster_version
-
-  tags = {
-    Blueprint  = local.name
-    GithubRepo = "github.com/awslabs/data-on-eks"
-  }
-}
-
 #---------------------------------------------------------------
 # EKS Cluster
 #---------------------------------------------------------------
diff --git a/ai-ml/ray/terraform/providers.tf b/ai-ml/ray/terraform/providers.tf
new file mode 100644
index 000000000..00119c4c2
--- /dev/null
+++ b/ai-ml/ray/terraform/providers.tf
@@ -0,0 +1,53 @@
+#---------------------------------------------------------------
+# Providers
+#---------------------------------------------------------------
+
+provider "aws" {
+  region = local.region
+}
+
+# Used for Karpenter Helm chart
+provider "aws" {
+  region = "us-east-1"
+  alias  = "ecr_public_region"
+}
+
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    # This requires the awscli to be installed locally where Terraform is executed
+    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+  }
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+    exec {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      command     = "aws"
+      # This requires the awscli to be installed locally where Terraform is executed
+      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+    }
+  }
+}
+
+provider "kubectl" {
+  apply_retry_count      = 5
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  load_config_file       = false
+
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    # This requires the awscli to be installed locally where Terraform is executed
+    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+  }
+}
diff --git a/ai-ml/ray/terraform/variables.tf b/ai-ml/ray/terraform/variables.tf
index 4e52b46d3..7d1417425 100644
--- a/ai-ml/ray/terraform/variables.tf
+++ b/ai-ml/ray/terraform/variables.tf
@@ -1,17 +1,14 @@
 variable "region" {
   description = "Region"
   type        = string
-  default     = "us-west-2"
 }
 
 variable "name" {
   description = "Name of the VPC, EKS Cluster and Ray cluster"
-  default     = "ray-cluster"
   type        = string
 }
 
 variable "eks_cluster_version" {
   description = "EKS Cluster version"
-  default     = "1.25"
   type        = string
 }
diff --git a/ai-ml/trainium-inferentia/eks.tf b/ai-ml/trainium-inferentia/eks.tf
index 642fd472b..d7b4af8ba 100644
--- a/ai-ml/trainium-inferentia/eks.tf
+++ b/ai-ml/trainium-inferentia/eks.tf
@@ -1,3 +1,17 @@
+#---------------------------------------------------------------
+# Data Sources
+#---------------------------------------------------------------
+
+data "aws_ecrpublic_authorization_token" "token" {
+  provider = aws.ecr
+}
+
+data "aws_caller_identity" "current" {}
+
+data "aws_iam_session_context" "current" {
+  arn = data.aws_caller_identity.current.arn
+}
+
 #---------------------------------------------------------------
 # EKS Cluster
 #---------------------------------------------------------------
diff --git a/ai-ml/trainium-inferentia/jupyterhub.tf b/ai-ml/trainium-inferentia/jupyterhub.tf
index 8d2754597..51a8ee6a8 100644
--- a/ai-ml/trainium-inferentia/jupyterhub.tf
+++ b/ai-ml/trainium-inferentia/jupyterhub.tf
@@ -1,14 +1,3 @@
-#-----------------------------------------------------------------------------------------
-# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry
-#-----------------------------------------------------------------------------------------
-resource "kubernetes_namespace_v1" "jupyterhub" {
-  count = var.enable_jupyterhub ? 1 : 0
-
-  metadata {
-    name = "jupyterhub"
-  }
-}
-
 module "jupyterhub_single_user_irsa" {
   count = var.enable_jupyterhub ? 1 : 0
 
@@ -28,6 +17,17 @@ module "jupyterhub_single_user_irsa" {
   }
 }
 
+#-----------------------------------------------------------------------------------------
+# JupyterHub Single User IRSA, maybe that block could be incorporated in add-on registry
+#-----------------------------------------------------------------------------------------
+resource "kubernetes_namespace_v1" "jupyterhub" {
+  count = var.enable_jupyterhub ? 1 : 0
+
+  metadata {
+    name = "jupyterhub"
+  }
+}
+
 resource "kubernetes_service_account_v1" "jupyterhub_single_user_sa" {
   count = var.enable_jupyterhub ? 1 : 0
 
diff --git a/ai-ml/trainium-inferentia/locals.tf b/ai-ml/trainium-inferentia/locals.tf
new file mode 100644
index 000000000..2f2e4fe0d
--- /dev/null
+++ b/ai-ml/trainium-inferentia/locals.tf
@@ -0,0 +1,33 @@
+#---------------------------------------------------------------
+# Locals
+#---------------------------------------------------------------
+
+locals {
+  name   = var.name
+  region = var.region
+
+  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
+  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
+  # Routable Public subnets with NAT Gateway and Internet Gateway
+  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
+  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
+  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
+  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
+  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
+
+  # Trn1 and Inf2 instances are available in specific AZs in us-east-1,
+  # us-east-2, and us-west-2. For Trn1, the first AZ id (below) should be used.
+  az_mapping = {
+    "us-west-2" = ["usw2-az4", "usw2-az1"],
+    "us-east-1" = ["use1-az6", "use1-az5"],
+    "us-east-2" = ["use2-az3", "use2-az1"]
+  }
+
+  azs = local.az_mapping[var.region]
+
+  tags = {
+    Blueprint  = local.name
+    GithubRepo = "github.com/awslabs/data-on-eks"
+  }
+}
diff --git a/ai-ml/trainium-inferentia/providers.tf b/ai-ml/trainium-inferentia/providers.tf
new file mode 100755
index 000000000..a62a0bb7c
--- /dev/null
+++ b/ai-ml/trainium-inferentia/providers.tf
@@ -0,0 +1,48 @@
+provider "aws" {
+  region = local.region
+}
+
+provider "aws" {
+  alias  = "ecr"
+  region = "us-east-1"
+}
+
+provider "kubernetes" {
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    # This requires the awscli to be installed locally where Terraform is executed
+    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+  }
+}
+
+provider "helm" {
+  kubernetes {
+    host                   = module.eks.cluster_endpoint
+    cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+
+    exec {
+      api_version = "client.authentication.k8s.io/v1beta1"
+      command     = "aws"
+      # This requires the awscli to be installed locally where Terraform is executed
+      args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+    }
+  }
+}
+
+provider "kubectl" {
+  apply_retry_count      = 5
+  host                   = module.eks.cluster_endpoint
+  cluster_ca_certificate = base64decode(module.eks.cluster_certificate_authority_data)
+  load_config_file       = false
+
+  exec {
+    api_version = "client.authentication.k8s.io/v1beta1"
+    command     = "aws"
+    # This requires the awscli to be installed locally where Terraform is executed
+    args = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
+  }
+}
diff --git a/ai-ml/trainium-inferentia/vpc.tf b/ai-ml/trainium-inferentia/vpc.tf
index 59c3da89c..e6f0b1dfb 100755
--- a/ai-ml/trainium-inferentia/vpc.tf
+++ b/ai-ml/trainium-inferentia/vpc.tf
@@ -1,15 +1,3 @@
-locals {
-  # Routable Private subnets only for Private NAT Gateway -> Transit Gateway -> Second VPC for overlapping CIDRs
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.0.0/24", "10.1.1.0/24"] => 256-2 = 254 usable IPs per subnet/AZ
-  private_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 3, k)]
-  # Routable Public subnets with NAT Gateway and Internet Gateway
-  # e.g., var.vpc_cidr = "10.1.0.0/21" => output: ["10.1.2.0/26", "10.1.2.64/26"] => 64-2 = 62 usable IPs per subnet/AZ
-  public_subnets = [for k, v in local.azs : cidrsubnet(var.vpc_cidr, 5, k + 8)]
-  # RFC6598 range 100.64.0.0/16 for EKS Data Plane for two subnets(32768 IPs per Subnet) across two AZs for EKS Control Plane ENI + Nodes + Pods
-  # e.g., var.secondary_cidr_blocks = "100.64.0.0/16" => output: ["100.64.0.0/17", "100.64.128.0/17"] => 32768-2 = 32766 usable IPs per subnet/AZ
-  secondary_ip_range_private_subnets = [for k, v in local.azs : cidrsubnet(element(var.secondary_cidr_blocks, 0), 1, k)]
-}
-
 #---------------------------------------------------------------
 # VPC
 #---------------------------------------------------------------