Pods & Containers
Multi-Container Pod Patterns
Sidecar Pattern
apiVersion: v1
kind: Pod
metadata:
name: app-with-log-sidecar
spec:
containers:
- name: app
image: myapp:3.2.1
ports:
- containerPort: 8080
volumeMounts:
- name: logs
mountPath: /var/log/app
- name: log-shipper
image: fluentd:v1.16
volumeMounts:
- name: logs
mountPath: /var/log/app
readOnly: true
env:
- name: FLUENTD_CONF
value: fluent.conf
volumes:
- name: logs
emptyDir: {}
Init Container (DB migration before app starts)
apiVersion: v1
kind: Pod
metadata:
name: app-with-migration
spec:
initContainers:
- name: migrate
image: myapp:3.2.1
command: ["./manage.py", "migrate", "--no-input"]
env:
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: db-credentials
key: url
containers:
- name: app
image: myapp:3.2.1
ports:
- containerPort: 8080
Liveness, Readiness & Startup Probes
apiVersion: v1
kind: Pod
metadata:
name: probed-app
spec:
containers:
- name: app
image: myapp:3.2.1
ports:
- containerPort: 8080
# Restart container if /healthz fails
livenessProbe:
httpGet:
path: /healthz
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
failureThreshold: 3
# Remove from service endpoints if not ready
readinessProbe:
httpGet:
path: /ready
port: 8080
initialDelaySeconds: 5
periodSeconds: 5
successThreshold: 2
# Protect slow-starting containers
startupProbe:
httpGet:
path: /healthz
port: 8080
failureThreshold: 30
periodSeconds: 10
Ephemeral Containers (Debug)
# Attach a debug container to a running pod
kubectl debug -it pod/myapp-7d8f9b --image=busybox:1.36 --target=app

# Debug with a copy of the pod (changes image)
kubectl debug pod/myapp-7d8f9b -it --copy-to=debug-pod --container=app --image=ubuntu

# Debug node-level issues
kubectl debug node/worker-1 -it --image=ubuntu
Pod Disruption Budgets
apiVersion: policy/v1
kind: PodDisruptionBudget
metadata:
  name: app-pdb
spec:
  # At least 2 pods must remain available during voluntary disruptions
  minAvailable: 2
  # OR: maxUnavailable: 1
  selector:
    matchLabels:
      app: myapp
Pod Topology Spread
apiVersion: v1
kind: Pod
metadata:
name: spread-pod
labels:
app: web
spec:
topologySpreadConstraints:
- maxSkew: 1
topologyKey: topology.kubernetes.io/zone
whenUnsatisfiable: DoNotSchedule
labelSelector:
matchLabels:
app: web
- maxSkew: 1
topologyKey: kubernetes.io/hostname
whenUnsatisfiable: ScheduleAnyway
labelSelector:
matchLabels:
app: web
containers:
- name: web
image: nginx:1.25
Deployments & Rollouts
Production-Grade Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
name: api-server
labels:
app: api-server
version: v3.2.1
spec:
replicas: 5
revisionHistoryLimit: 10
strategy:
type: RollingUpdate
rollingUpdate:
maxSurge: 1 # 1 extra pod during rollout
maxUnavailable: 0 # zero-downtime
selector:
matchLabels:
app: api-server
template:
metadata:
labels:
app: api-server
version: v3.2.1
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "9090"
spec:
terminationGracePeriodSeconds: 60
containers:
- name: api
image: registry.example.com/api:3.2.1
ports:
- name: http
containerPort: 8080
- name: metrics
containerPort: 9090
resources:
requests:
cpu: 250m
memory: 256Mi
limits:
cpu: "1"
memory: 512Mi
env:
- name: LOG_LEVEL
valueFrom:
configMapKeyRef:
name: api-config
key: log-level
readinessProbe:
httpGet:
path: /ready
port: http
periodSeconds: 5
livenessProbe:
httpGet:
path: /healthz
port: http
initialDelaySeconds: 15
lifecycle:
preStop:
exec:
# Allow in-flight requests to drain
command: ["/bin/sh", "-c", "sleep 15"]
Rollout Operations
# Watch rollout progress
kubectl rollout status deployment/api-server --timeout=5m

# View rollout history
kubectl rollout history deployment/api-server
kubectl rollout history deployment/api-server --revision=3

# Rollback to previous
kubectl rollout undo deployment/api-server

# Rollback to specific revision
kubectl rollout undo deployment/api-server --to-revision=7

# Pause/resume rollout (canary-style)
kubectl rollout pause deployment/api-server
# Inspect the new pods, check metrics...
kubectl rollout resume deployment/api-server

# Trigger rollout via annotation change (no image change)
kubectl patch deployment api-server -p \
  '{"spec":{"template":{"metadata":{"annotations":{"rollout-trigger":"'$(date +%s)'"}}}}}'
Blue-Green via Service Switching
# Deploy "green" alongside existing "blue"
# Then switch the Service selector:
kubectl patch svc api-server -p '{"spec":{"selector":{"version":"v3.3.0"}}}'

# Verify, then scale down blue
kubectl scale deployment api-server-blue --replicas=0
Set minReadySeconds: 30 on deployments to catch crashlooping pods before they replace healthy ones during a rollout.
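A minimal sketch of where that field sits in a Deployment spec (the deployment shown is the api-server example from this section):

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: api-server
spec:
  minReadySeconds: 30   # A new pod must stay Ready for 30s before it counts
                        # as available and the rollout proceeds
  replicas: 5
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
```

A pod that crashloops shortly after passing its readiness probe resets the 30s window, so the rollout stalls instead of replacing healthy pods.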
Services & Endpoints
Service Types Compared
| Type | Scope | Use Case |
|---|---|---|
| ClusterIP | Cluster-internal | Internal microservice communication |
| NodePort | Exposed on each node (30000-32767) | Dev/test, direct node access |
| LoadBalancer | Cloud LB provisioned | Production external traffic |
| ExternalName | CNAME alias | Reference external services |
| Headless (clusterIP: None) | DNS-only, no proxy | StatefulSets, custom discovery |
ClusterIP with Session Affinity
apiVersion: v1
kind: Service
metadata:
name: api-server
spec:
type: ClusterIP
selector:
app: api-server
sessionAffinity: ClientIP
sessionAffinityConfig:
clientIP:
timeoutSeconds: 3600
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
Headless Service for StatefulSet
apiVersion: v1
kind: Service
metadata:
name: postgres-headless
spec:
clusterIP: None
selector:
app: postgres
ports:
- port: 5432
targetPort: 5432
# Each pod gets a DNS record: pod-0.postgres-headless.namespace.svc.cluster.local
ExternalName (Map internal name to external DNS)
apiVersion: v1
kind: Service
metadata:
name: external-db
spec:
type: ExternalName
externalName: db.prod.example.com
# Pods can reach db.prod.example.com via "external-db" service name
Service Without Selectors (manual endpoints)
apiVersion: v1
kind: Service
metadata:
name: legacy-api
spec:
ports:
- port: 443
targetPort: 8443
---
apiVersion: v1
kind: Endpoints
metadata:
name: legacy-api # Must match Service name
subsets:
- addresses:
- ip: 10.0.50.12
- ip: 10.0.50.13
ports:
- port: 8443
Ingress
NGINX Ingress with TLS, Rate Limiting & Rewrites
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: main-ingress
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
nginx.ingress.kubernetes.io/proxy-body-size: "50m"
nginx.ingress.kubernetes.io/limit-rps: "50"
nginx.ingress.kubernetes.io/limit-burst-multiplier: "5"
nginx.ingress.kubernetes.io/configuration-snippet: |
more_set_headers "X-Frame-Options: DENY";
more_set_headers "X-Content-Type-Options: nosniff";
cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
ingressClassName: nginx
tls:
- hosts:
- api.example.com
- app.example.com
secretName: example-com-tls
rules:
- host: api.example.com
http:
paths:
- path: /v1
pathType: Prefix
backend:
service:
name: api-v1
port:
number: 80
- path: /v2
pathType: Prefix
backend:
service:
name: api-v2
port:
number: 80
- host: app.example.com
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: frontend
port:
number: 80
Path Rewriting
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: rewrite-ingress
annotations:
nginx.ingress.kubernetes.io/rewrite-target: /$2
spec:
ingressClassName: nginx
rules:
- host: api.example.com
http:
paths:
# /api/users/123 → forwards to backend as /users/123
- path: /api(/|$)(.*)
pathType: ImplementationSpecific
backend:
service:
name: api-server
port:
number: 80
Gateway API (Modern Replacement)
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: main-gateway
namespace: gateway-infra
spec:
gatewayClassName: istio
listeners:
- name: https
protocol: HTTPS
port: 443
tls:
mode: Terminate
certificateRefs:
- name: example-com-tls
allowedRoutes:
namespaces:
from: Selector
selector:
matchLabels:
gateway-access: "true"
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: api-route
spec:
parentRefs:
- name: main-gateway
namespace: gateway-infra
hostnames:
- "api.example.com"
rules:
- matches:
- path:
type: PathPrefix
value: /v1
backendRefs:
- name: api-v1
port: 80
weight: 90
- name: api-v2
port: 80
weight: 10 # 10% canary traffic
Gateway API is the successor to Ingress. It provides more expressive routing, traffic splitting, and cross-namespace references. Prefer it for new clusters running K8s 1.27+.
Egress & Network Policies
Default Deny All (Namespace Isolation)
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: default-deny-all
namespace: production
spec:
podSelector: {} # Applies to ALL pods in namespace
policyTypes:
- Ingress
- Egress
After applying default-deny, pods lose access to DNS (CoreDNS) and cannot resolve service names. Always pair with a DNS egress rule.
Allow DNS + Specific Egress
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: api-egress
namespace: production
spec:
podSelector:
matchLabels:
app: api-server
policyTypes:
- Egress
egress:
# Allow DNS resolution
- to:
- namespaceSelector: {}
podSelector:
matchLabels:
k8s-app: kube-dns
ports:
- protocol: UDP
port: 53
- protocol: TCP
port: 53
# Allow traffic to postgres in same namespace
- to:
- podSelector:
matchLabels:
app: postgres
ports:
- protocol: TCP
port: 5432
# Allow HTTPS to external APIs (CIDR-based)
- to:
- ipBlock:
cidr: 0.0.0.0/0
except:
- 10.0.0.0/8
- 172.16.0.0/12
- 192.168.0.0/16
ports:
- protocol: TCP
port: 443
Ingress Policy: Allow Only from Specific Namespace
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
name: db-ingress
namespace: production
spec:
podSelector:
matchLabels:
app: postgres
policyTypes:
- Ingress
ingress:
- from:
# Only pods in namespaces labeled team=backend
- namespaceSelector:
matchLabels:
team: backend
podSelector:
matchLabels:
role: api
ports:
- protocol: TCP
port: 5432
Within a single from entry, combining namespaceSelector and podSelector is AND logic (both must match). Separate from entries are OR logic (any one may match). This is a common source of misconfiguration.
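For contrast, a sketch of the OR form: two separate from entries, each evaluated independently (labels here are illustrative):

```yaml
ingress:
- from:
  # OR logic: traffic is allowed if it comes from EITHER
  # any pod in a namespace labeled team=backend ...
  - namespaceSelector:
      matchLabels:
        team: backend
  # ... OR any pod labeled role=api in the policy's own namespace
  - podSelector:
      matchLabels:
        role: api
  ports:
  - protocol: TCP
    port: 5432
```

Note the second dash: indenting podSelector one level deeper (no dash) would merge it into the first entry and turn this back into AND logic.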
Cilium: L7 Egress Policy (FQDN-based)
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
name: allow-github-api
spec:
endpointSelector:
matchLabels:
app: ci-runner
egress:
- toFQDNs:
- matchName: "api.github.com"
- matchPattern: "*.githubusercontent.com"
toPorts:
- ports:
- port: "443"
protocol: TCP
DNS & Service Discovery
DNS Resolution Reference
| Record Type | Format | Example |
|---|---|---|
| Service (ClusterIP) | svc.namespace.svc.cluster.local | api-server.production.svc.cluster.local |
| Headless Pod | pod.svc.namespace.svc.cluster.local | pg-0.postgres-headless.db.svc.cluster.local |
| Pod IP-based | a-b-c-d.namespace.pod.cluster.local | 10-244-1-5.default.pod.cluster.local |
Custom DNS Policy
apiVersion: v1
kind: Pod
metadata:
name: custom-dns
spec:
dnsPolicy: None
dnsConfig:
nameservers:
- 8.8.8.8
- 1.1.1.1
searches:
- production.svc.cluster.local
- svc.cluster.local
options:
- name: ndots
value: "2" # Reduce unnecessary search domain lookups
- name: single-request-reopen
containers:
- name: app
image: myapp:latest
Setting ndots: 2 (default is 5) significantly reduces DNS query volume. Names with 2+ dots resolve directly rather than trying all search domains first.
Persistent Volumes & Claims
PersistentVolume (Manual Provisioning)
apiVersion: v1
kind: PersistentVolume
metadata:
name: nfs-data
labels:
type: nfs
spec:
capacity:
storage: 100Gi
accessModes:
- ReadWriteMany
persistentVolumeReclaimPolicy: Retain
storageClassName: "" # Disable dynamic provisioning
nfs:
server: nfs.internal.example.com
path: /exports/data
mountOptions:
- nfsvers=4.1
- hard
- rsize=1048576
- wsize=1048576
PersistentVolumeClaim
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: app-data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi
storageClassName: gp3-encrypted
# Optionally bind to a specific PV
# volumeName: nfs-data
Using PVC in a Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: app
spec:
  replicas: 1  # RWO volumes can only attach to one node
  selector:
    matchLabels:
      app: myapp
  template:
    metadata:
      labels:
        app: myapp
    spec:
      containers:
      - name: app
        image: myapp:3.2.1
        volumeMounts:
        - name: data
          mountPath: /data
        - name: cache
          mountPath: /tmp/cache
      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: app-data
      - name: cache
        emptyDir:
          medium: Memory  # tmpfs-backed, counts against memory limits
          sizeLimit: 256Mi
Volume Expansion
# StorageClass must have allowVolumeExpansion: true
kubectl patch pvc app-data -p '{"spec":{"resources":{"requests":{"storage":"100Gi"}}}}'

# Check status (may require pod restart for filesystem resize)
kubectl get pvc app-data -o jsonpath='{.status.conditions}'
Volume Access Modes
| Mode | Abbrev | Description |
|---|---|---|
| ReadWriteOnce | RWO | Single node read-write |
| ReadOnlyMany | ROX | Multi-node read-only |
| ReadWriteMany | RWX | Multi-node read-write (NFS, EFS, CephFS) |
| ReadWriteOncePod | RWOP | Single pod read-write (K8s 1.27+) |
StorageClasses
AWS EBS gp3 with Encryption
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: gp3-encrypted
annotations:
storageclass.kubernetes.io/is-default-class: "true"
provisioner: ebs.csi.aws.com
parameters:
type: gp3
iops: "3000"
throughput: "125"
encrypted: "true"
kmsKeyId: arn:aws:kms:us-east-1:123456789:key/abc-123
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: WaitForFirstConsumer # Bind to AZ of first consumer pod
GCP Regional SSD
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ssd-regional
provisioner: pd.csi.storage.gke.io
parameters:
  type: pd-ssd
  replication-type: regional-pd
reclaimPolicy: Retain
volumeBindingMode: WaitForFirstConsumer
VolumeSnapshotClass + Snapshot
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshotClass
metadata:
name: ebs-snapshot-class
driver: ebs.csi.aws.com
deletionPolicy: Retain
---
apiVersion: snapshot.storage.k8s.io/v1
kind: VolumeSnapshot
metadata:
name: db-snapshot-2024-01-15
spec:
volumeSnapshotClassName: ebs-snapshot-class
source:
persistentVolumeClaimName: postgres-data
# Restore from snapshot
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: postgres-data-restored
spec:
accessModes: [ReadWriteOnce]
storageClassName: gp3-encrypted
resources:
requests:
storage: 100Gi
dataSource:
name: db-snapshot-2024-01-15
kind: VolumeSnapshot
apiGroup: snapshot.storage.k8s.io
ConfigMaps
Creating ConfigMaps
# From literal values
kubectl create configmap app-config \
  --from-literal=LOG_LEVEL=info \
  --from-literal=MAX_CONNECTIONS=100

# From a file
kubectl create configmap nginx-conf --from-file=nginx.conf

# From a directory
kubectl create configmap app-configs --from-file=./configs/

# From env file
kubectl create configmap env-config --from-env-file=.env.production
ConfigMap Manifest
apiVersion: v1
kind: ConfigMap
metadata:
name: app-config
data:
LOG_LEVEL: info
MAX_CONNECTIONS: "100"
# Multi-line config file
app.properties: |
server.port=8080
cache.ttl=300
feature.dark-mode=true
db.pool.size=20
Using ConfigMaps
apiVersion: v1
kind: Pod
metadata:
name: app
spec:
containers:
- name: app
image: myapp:latest
# Individual keys as env vars
env:
- name: LOG_LEVEL
valueFrom:
configMapKeyRef:
name: app-config
key: LOG_LEVEL
# All keys as env vars
envFrom:
- configMapRef:
name: app-config
prefix: APP_ # Optional prefix: APP_LOG_LEVEL, APP_MAX_CONNECTIONS
volumeMounts:
# Mount as files
- name: config-vol
mountPath: /etc/app
readOnly: true
volumes:
- name: config-vol
configMap:
name: app-config
items: # Mount only specific keys
- key: app.properties
path: application.properties
ConfigMaps mounted as volumes auto-update when the ConfigMap changes (with a propagation delay of ~60s). Environment variables do NOT auto-update — they require a pod restart.
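A common workaround for env-var consumers is to hash the config into a pod-template annotation, so any config change alters the template and triggers a rollout. A sketch (the annotation key is arbitrary, and the hash value shown is a placeholder):

```yaml
# In the Deployment's pod template:
template:
  metadata:
    annotations:
      # e.g. generated in CI via:
      #   kubectl get cm app-config -o yaml | sha256sum
      config-checksum: "<sha256-of-configmap>"  # changing this forces new pods
```

Helm users typically automate the same idea with a checksum of the rendered ConfigMap template in the chart.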
Immutable ConfigMaps (K8s 1.21+)
apiVersion: v1
kind: ConfigMap
metadata:
name: app-config-v3
immutable: true # Cannot be changed after creation, reduces apiserver load
data:
LOG_LEVEL: info
Secrets
Secret Types
| Type | Usage |
|---|---|
| Opaque | Generic key-value (default) |
| kubernetes.io/tls | TLS cert + key |
| kubernetes.io/dockerconfigjson | Private registry credentials |
| kubernetes.io/basic-auth | Username + password |
| kubernetes.io/ssh-auth | SSH private key |
Creating Secrets
# Generic secret from literals
kubectl create secret generic db-creds \
  --from-literal=username=admin \
  --from-literal=password='s3cUr3!p@ss'

# TLS secret
kubectl create secret tls example-tls \
  --cert=./tls.crt \
  --key=./tls.key

# Docker registry secret
kubectl create secret docker-registry regcred \
  --docker-server=registry.example.com \
  --docker-username=deploy \
  --docker-password=$REGISTRY_TOKEN
Secret Manifest (base64-encoded)
apiVersion: v1
kind: Secret
metadata:
  name: db-credentials
type: Opaque
data:
  username: YWRtaW4=          # echo -n 'admin' | base64
  password: czNjVXIzIXBAc3M=  # echo -n 's3cUr3!p@ss' | base64
# OR use stringData for plain text (auto-encoded on apply)
stringData:
  connection-string: "postgres://admin:s3cUr3!p@ss@db:5432/mydb?sslmode=require"
Using Secrets in Pods
apiVersion: v1
kind: Pod
metadata:
name: app
spec:
containers:
- name: app
image: myapp:latest
env:
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
name: db-credentials
key: password
volumeMounts:
- name: tls
mountPath: /etc/tls
readOnly: true
volumes:
- name: tls
secret:
secretName: example-tls
defaultMode: 0400 # Restrict file permissions
imagePullSecrets:
- name: regcred # Private registry auth
K8s Secrets are base64-encoded, NOT encrypted. Enable encryption at rest via EncryptionConfiguration, or use external secret managers (Vault, AWS Secrets Manager) with operators like External Secrets Operator.
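A minimal EncryptionConfiguration sketch for the API server (passed via the --encryption-provider-config flag; the key material below is a placeholder):

```yaml
apiVersion: apiserver.config.k8s.io/v1
kind: EncryptionConfiguration
resources:
- resources: [secrets]
  providers:
  # First provider is used for writes; all are tried for reads
  - aescbc:
      keys:
      - name: key1
        secret: <base64-encoded-32-byte-key>  # placeholder, generate with:
                                              # head -c 32 /dev/urandom | base64
  - identity: {}  # Fallback so pre-existing unencrypted secrets remain readable
```

After enabling it, rewrite existing secrets so they are stored encrypted: kubectl get secrets -A -o json | kubectl replace -f -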
External Secrets Operator (ESO)
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
name: db-credentials
spec:
refreshInterval: 1h
secretStoreRef:
name: aws-secrets-manager
kind: ClusterSecretStore
target:
name: db-credentials
creationPolicy: Owner
data:
- secretKey: password
remoteRef:
key: production/db-credentials
property: password
StatefulSets
PostgreSQL StatefulSet
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgres
spec:
  serviceName: postgres-headless
  replicas: 3
  podManagementPolicy: OrderedReady  # Sequential startup (default)
  updateStrategy:
    type: RollingUpdate
    rollingUpdate:
      partition: 0  # Set > 0 for canary: only pods >= partition update
  selector:
    matchLabels:
      app: postgres
  template:
    metadata:
      labels:
        app: postgres
    spec:
      terminationGracePeriodSeconds: 120
      containers:
      - name: postgres
        image: postgres:16.1
        ports:
        - containerPort: 5432
          name: postgres
        env:
        - name: POSTGRES_PASSWORD
          valueFrom:
            secretKeyRef:
              name: pg-credentials
              key: password
        - name: PGDATA
          value: /var/lib/postgresql/data/pgdata
        volumeMounts:
        - name: data
          mountPath: /var/lib/postgresql/data
        resources:
          requests:
            cpu: 500m
            memory: 1Gi
          limits:
            cpu: "2"
            memory: 4Gi
        readinessProbe:
          exec:
            command:
            - pg_isready
            - -U
            - postgres
          periodSeconds: 10
  # Each replica gets its own PVC (data-postgres-0, data-postgres-1, ...)
  volumeClaimTemplates:
  - metadata:
      name: data
    spec:
      accessModes: [ReadWriteOnce]
      storageClassName: gp3-encrypted
      resources:
        requests:
          storage: 100Gi
Scaling down a StatefulSet does NOT delete its PVCs. This is intentional for data safety but can lead to orphaned volumes. Clean up manually: kubectl delete pvc data-postgres-2
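Newer clusters can automate the cleanup with the PVC retention policy field (beta since K8s 1.27; check availability on your version). A sketch of the relevant StatefulSet spec fragment:

```yaml
spec:
  persistentVolumeClaimRetentionPolicy:
    whenDeleted: Delete  # Remove PVCs when the StatefulSet itself is deleted
    whenScaled: Delete   # Remove PVCs of pods removed by a scale-down
```

The default for both fields is Retain, which preserves today's keep-the-data behavior.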
DaemonSets
Node Log Collector with Tolerations
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: fluentd
namespace: logging
spec:
selector:
matchLabels:
app: fluentd
updateStrategy:
type: RollingUpdate
rollingUpdate:
maxUnavailable: 1
template:
metadata:
labels:
app: fluentd
spec:
serviceAccountName: fluentd
# Run on ALL nodes, including control plane
tolerations:
- operator: Exists
containers:
- name: fluentd
image: fluent/fluentd:v1.16
resources:
requests:
cpu: 100m
memory: 200Mi
limits:
memory: 500Mi
volumeMounts:
- name: varlog
mountPath: /var/log
readOnly: true
- name: containers
mountPath: /var/lib/docker/containers
readOnly: true
volumes:
- name: varlog
hostPath:
path: /var/log
- name: containers
hostPath:
path: /var/lib/docker/containers
Jobs & CronJobs
Parallel Job with Backoff
apiVersion: batch/v1
kind: Job
metadata:
  name: data-migration
spec:
  parallelism: 4                  # 4 pods run concurrently
  completions: 20                 # Total work items to complete
  backoffLimit: 3                 # Retry failed pods up to 3 times
  activeDeadlineSeconds: 3600
  ttlSecondsAfterFinished: 86400  # Auto-cleanup after 24h
  template:
    spec:
      restartPolicy: Never
      containers:
      - name: migrate
        image: migration-tool:2.1
        command: ["./migrate", "--batch"]
        resources:
          requests:
            cpu: 500m
            memory: 512Mi
CronJob with Concurrency Control
apiVersion: batch/v1
kind: CronJob
metadata:
  name: db-backup
spec:
  schedule: "0 2 * * *"          # Daily at 2 AM in the timeZone below
  timeZone: "America/New_York"   # K8s 1.27+ timezone support
  concurrencyPolicy: Forbid      # Skip if previous still running
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 5
  startingDeadlineSeconds: 600   # Fail if can't start within 10 min
  jobTemplate:
    spec:
      backoffLimit: 2
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - name: backup
            image: pg-backup:1.5
            env:
            - name: PGPASSWORD
              valueFrom:
                secretKeyRef:
                  name: pg-credentials
                  key: password
            command:
            - /bin/sh
            - -c
            - |
              pg_dump -h postgres-0.postgres-headless -U postgres mydb \
                | gzip | aws s3 cp - s3://backups/db/$(date +%Y%m%d).sql.gz
HPA & VPA
HPA with Custom Metrics
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: api-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: api-server
minReplicas: 3
maxReplicas: 50
behavior:
scaleDown:
stabilizationWindowSeconds: 300 # Wait 5 min before scaling down
policies:
- type: Percent
value: 10
periodSeconds: 60
scaleUp:
stabilizationWindowSeconds: 0 # Scale up immediately
policies:
- type: Pods
value: 4
periodSeconds: 60
- type: Percent
value: 100
periodSeconds: 60
selectPolicy: Max # Use whichever adds more pods
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
- type: Resource
resource:
name: memory
target:
type: Utilization
averageUtilization: 80
# Custom Prometheus metric via prometheus-adapter
- type: Pods
pods:
metric:
name: http_requests_per_second
target:
type: AverageValue
averageValue: "1000"
VPA (Vertical Pod Autoscaler)
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
name: api-vpa
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: api-server
updatePolicy:
updateMode: "Auto" # Off | Initial | Recreate | Auto
resourcePolicy:
containerPolicies:
- containerName: api
minAllowed:
cpu: 100m
memory: 128Mi
maxAllowed:
cpu: "4"
memory: 8Gi
controlledResources: [cpu, memory]
Do NOT use HPA and VPA on the same resource dimension (e.g., both scaling on CPU). They will conflict. Common pattern: HPA on custom metrics + VPA on CPU/memory.
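One safe way to combine them on the same workload is to run VPA in recommendation-only mode and apply its sizing suggestions manually, while HPA owns replica count. A sketch reusing the api-server Deployment from above:

```yaml
apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
  name: api-vpa-recommender
spec:
  targetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: api-server
  updatePolicy:
    updateMode: "Off"  # Compute recommendations only; never evict or resize pods
```

Read the recommendations with: kubectl describe vpa api-vpa-recommender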
Resource Management
Resource Requests vs Limits
| Resource | Requests | Limits |
|---|---|---|
| CPU | Guaranteed minimum, used for scheduling | Throttled (not killed) if exceeded |
| Memory | Guaranteed minimum, used for scheduling | OOMKilled if exceeded |
QoS Classes
| Class | Condition | Eviction Priority |
|---|---|---|
| Guaranteed | requests == limits for all containers | Last (safest) |
| Burstable | At least one request or limit set | Middle |
| BestEffort | No requests or limits | First (evicted first) |
# Guaranteed QoS - production workloads
resources:
  requests:
    cpu: "1"
    memory: 1Gi
  limits:
    cpu: "1"
    memory: 1Gi

# Burstable - most common in practice
resources:
  requests:
    cpu: 250m      # 0.25 cores
    memory: 256Mi
  limits:
    cpu: "2"       # Can burst to 2 cores
    memory: 1Gi    # Hard OOM boundary
For CPU: set requests based on steady-state usage, limits 2-4x requests (or omit limits entirely — throttling is often worse than burst). For Memory: always set limits close to requests to avoid OOM surprises.
LimitRanges & ResourceQuotas
LimitRange (Per-Pod Defaults & Constraints)
apiVersion: v1
kind: LimitRange
metadata:
name: container-limits
namespace: production
spec:
limits:
- type: Container
default: # Applied if no limits specified
cpu: 500m
memory: 512Mi
defaultRequest: # Applied if no requests specified
cpu: 100m
memory: 128Mi
max:
cpu: "4"
memory: 8Gi
min:
cpu: 50m
memory: 64Mi
- type: PersistentVolumeClaim
max:
storage: 500Gi
min:
storage: 1Gi
ResourceQuota (Namespace-Level Caps)
apiVersion: v1
kind: ResourceQuota
metadata:
name: team-quota
namespace: team-alpha
spec:
hard:
requests.cpu: "20"
requests.memory: 40Gi
limits.cpu: "40"
limits.memory: 80Gi
pods: "50"
services: "10"
persistentvolumeclaims: "20"
requests.storage: 500Gi
count/deployments.apps: "20"
count/secrets: "50"
scopeSelector:
matchExpressions:
- scopeName: PriorityClass
operator: In
values: ["high"]
# Check quota usage
kubectl describe resourcequota team-quota -n team-alpha
RBAC
Role & RoleBinding (Namespace-Scoped)
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: developer
  namespace: staging
rules:
# Full access to apps
- apiGroups: ["apps"]
  resources: ["deployments", "replicasets"]
  verbs: ["get", "list", "watch", "create", "update", "patch"]
# Read pods, exec into them, view logs
- apiGroups: [""]
  resources: ["pods"]
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources: ["pods/log", "pods/exec"]
  verbs: ["get", "create"]
# Read-only on configmaps and secrets
- apiGroups: [""]
  resources: ["configmaps"]
  verbs: ["get", "list"]
- apiGroups: [""]
  resources: ["secrets"]
  verbs: ["get"]
  resourceNames: ["app-config"]  # Restrict to specific secret
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: developer-binding
  namespace: staging
subjects:
- kind: User
  name: alice@example.com
  apiGroup: rbac.authorization.k8s.io
- kind: Group
  name: team-backend
  apiGroup: rbac.authorization.k8s.io
roleRef:
  kind: Role
  name: developer
  apiGroup: rbac.authorization.k8s.io
ClusterRole & Aggregation
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: monitoring-reader
labels:
rbac.example.com/aggregate-to-admin: "true" # Auto-aggregate into admin
rules:
- apiGroups: ["monitoring.coreos.com"]
resources: ["prometheuses", "alertmanagers", "servicemonitors"]
verbs: ["get", "list", "watch"]
---
# Aggregated ClusterRole (auto-collects matching rules)
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: admin-aggregated
aggregationRule:
clusterRoleSelectors:
- matchLabels:
rbac.example.com/aggregate-to-admin: "true"
rules: [] # Auto-populated from matching ClusterRoles
Audit RBAC Permissions
# Check if a user can perform an action
kubectl auth can-i create deployments --as=alice@example.com -n staging
kubectl auth can-i '*' '*' --as=system:serviceaccount:kube-system:default

# List all permissions for a user
kubectl auth can-i --list --as=alice@example.com -n staging

# Who can do what (requires kubectl-who-can plugin)
kubectl who-can delete pods -n production
Security Contexts
Hardened Pod Security Context
apiVersion: v1
kind: Pod
metadata:
name: hardened-app
spec:
securityContext:
runAsNonRoot: true
runAsUser: 1000
runAsGroup: 3000
fsGroup: 2000
seccompProfile:
type: RuntimeDefault
containers:
- name: app
image: myapp:3.2.1
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
capabilities:
drop: [ALL]
add: [NET_BIND_SERVICE] # Only if binding port < 1024
volumeMounts:
# Writable dirs for apps that need them
- name: tmp
mountPath: /tmp
- name: cache
mountPath: /var/cache
volumes:
- name: tmp
emptyDir: {}
- name: cache
emptyDir: {}
Pod Security Standards (PSS/PSA)
Namespace-Level Enforcement
apiVersion: v1
kind: Namespace
metadata:
name: production
labels:
# Enforce: reject pods that violate
pod-security.kubernetes.io/enforce: restricted
pod-security.kubernetes.io/enforce-version: latest
# Audit: log violations
pod-security.kubernetes.io/audit: restricted
# Warn: show warnings to users
pod-security.kubernetes.io/warn: restricted
PSS Levels
| Level | Description |
|---|---|
| privileged | Unrestricted (default, no restrictions) |
| baseline | Minimally restrictive (prevents known privilege escalations) |
| restricted | Heavily restricted (security best practices) |
# Dry-run: check existing pods against restricted
kubectl label --dry-run=server --overwrite ns production \
pod-security.kubernetes.io/enforce=restricted
Service Accounts
Scoped Service Account
apiVersion: v1
kind: ServiceAccount
metadata:
name: app-sa
namespace: production
annotations:
# AWS IRSA (IAM Roles for Service Accounts)
eks.amazonaws.com/role-arn: arn:aws:iam::123456789:role/app-s3-reader
# GKE Workload Identity
# iam.gke.io/gcp-service-account: app@project.iam.gserviceaccount.com
automountServiceAccountToken: false # Don't mount token unless needed
---
# Bind with least-privilege role
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: app-sa-binding
namespace: production
subjects:
- kind: ServiceAccount
name: app-sa
namespace: production
roleRef:
kind: Role
name: app-role
apiGroup: rbac.authorization.k8s.io
Bound Token (K8s 1.22+ projected)
apiVersion: v1
kind: Pod
metadata:
name: app
spec:
serviceAccountName: app-sa
automountServiceAccountToken: false
containers:
- name: app
image: myapp:latest
volumeMounts:
- name: token
mountPath: /var/run/secrets/tokens
volumes:
- name: token
projected:
sources:
- serviceAccountToken:
audience: vault
expirationSeconds: 3600
path: vault-token
Helm
Essential Commands
# Add & update repos
helm repo add bitnami https://charts.bitnami.com/bitnami
helm repo update

# Search for charts
helm search repo nginx --versions

# Show chart details
helm show values bitnami/nginx > values-defaults.yaml

# Install with custom values
helm install my-nginx bitnami/nginx \
  -f values-prod.yaml \
  --set service.type=ClusterIP \
  --namespace ingress --create-namespace \
  --version 15.4.0 \
  --wait --timeout 5m

# Upgrade (or install if not exists)
helm upgrade --install my-nginx bitnami/nginx \
  -f values-prod.yaml \
  --namespace ingress \
  --atomic \
  --cleanup-on-fail   # --atomic: auto-rollback on failure

# Rollback
helm rollback my-nginx 3 -n ingress

# Diff before upgrade (requires helm-diff plugin)
helm diff upgrade my-nginx bitnami/nginx -f values-prod.yaml -n ingress

# Template locally (debug without deploying)
helm template my-nginx bitnami/nginx -f values-prod.yaml --debug

# List releases
helm list -A
helm history my-nginx -n ingress
Helmfile (Declarative Multi-Release)
# helmfile.yaml
repositories:
- name: bitnami
url: https://charts.bitnami.com/bitnami
- name: prometheus
url: https://prometheus-community.github.io/helm-charts
environments:
production:
values:
- env/production.yaml
staging:
values:
- env/staging.yaml
releases:
- name: nginx
namespace: ingress
chart: bitnami/nginx
version: 15.4.0
values:
- values/nginx.yaml
- values/nginx-{{ .Environment.Name }}.yaml
- name: prometheus
namespace: monitoring
chart: prometheus/kube-prometheus-stack
version: 55.0.0
values:
- values/prometheus.yaml
Debugging
Pod Troubleshooting Flow
# 1. Check pod status & events
kubectl get pods -o wide
kubectl describe pod <name>

# 2. Check logs (current + previous crash)
kubectl logs <pod> -c <container> --tail=100
kubectl logs <pod> -c <container> --previous
kubectl logs <pod> --all-containers --since=1h

# 3. Exec into running pod
kubectl exec -it <pod> -- /bin/sh

# 4. Debug with ephemeral container
kubectl debug -it <pod> --image=nicolaka/netshoot --target=app

# 5. Check resource usage
kubectl top pods --sort-by=memory
kubectl top nodes

# 6. Check events cluster-wide
kubectl get events --sort-by='.lastTimestamp' -A | tail -50
kubectl get events --field-selector reason=OOMKilled -A
Network Debugging
# DNS resolution test
kubectl run dnstest --rm -it --image=busybox:1.36 --restart=Never -- \
  nslookup kubernetes.default.svc.cluster.local

# HTTP connectivity test
kubectl run curlpod --rm -it --image=curlimages/curl --restart=Never -- \
  curl -sv http://api-server.production.svc.cluster.local/healthz

# Full network debug toolkit
kubectl run netshoot --rm -it --image=nicolaka/netshoot --restart=Never -- bash
# Inside: tcpdump, dig, nmap, iperf, curl, etc.

# Check service endpoints
kubectl get endpoints api-server -o yaml
kubectl get endpointslices -l kubernetes.io/service-name=api-server
Node Debugging
# Check node conditions
kubectl describe node <node> | grep -A5 Conditions

# Resource pressure
kubectl get nodes -o custom-columns=\
NAME:.metadata.name,\
CPU:.status.allocatable.cpu,\
MEM:.status.allocatable.memory,\
DISK_PRESSURE:.status.conditions[?(@.type=='DiskPressure')].status

# Debug node directly
kubectl debug node/worker-1 -it --image=ubuntu
# chroot /host to access node filesystem
Common Failure Patterns
| Status | Likely Cause | Fix |
|---|---|---|
| CrashLoopBackOff | App crash on startup | Check kubectl logs --previous, verify env vars & config |
| ImagePullBackOff | Wrong image name/tag or missing pull secret | Verify image exists, check imagePullSecrets |
| Pending | No schedulable node (resources, taints, affinity) | kubectl describe pod → Events section |
| OOMKilled | Container exceeded memory limit | Increase limits or fix memory leak |
| Evicted | Node under resource pressure | Set proper requests, check node capacity |
| CreateContainerConfigError | Missing ConfigMap/Secret | Verify referenced CM/Secret exists |
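For the OOMKilled and Evicted rows, the usual remedy is explicit requests and limits on the container. A minimal sketch — the numeric values are placeholders to tune per workload:

```yaml
containers:
  - name: app
    image: myapp:3.2.1
    resources:
      requests:        # used for scheduling and eviction ranking
        cpu: 250m
        memory: 256Mi
      limits:          # exceeding the memory limit → OOMKilled
        memory: 512Mi
```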
Node Maintenance
Safe Node Drain
# Cordon: prevent new pods from scheduling
kubectl cordon worker-3

# Drain: evict existing pods gracefully
kubectl drain worker-3 \
  --ignore-daemonsets \
  --delete-emptydir-data \
  --grace-period=120 \
  --timeout=5m

# Perform maintenance...

# Uncordon: allow scheduling again
kubectl uncordon worker-3
Taints & Tolerations
# Taint a node
kubectl taint nodes gpu-node-1 nvidia.com/gpu=present:NoSchedule

# Remove a taint (note the trailing dash)
kubectl taint nodes gpu-node-1 nvidia.com/gpu=present:NoSchedule-

# Toleration in pod spec
tolerations:
  - key: nvidia.com/gpu
    operator: Equal
    value: present
    effect: NoSchedule

# Tolerate everything (e.g., DaemonSets)
tolerations:
  - operator: Exists
Taint Effects
| Effect | Behavior |
|---|---|
| NoSchedule | Don't schedule new pods (existing unaffected) |
| PreferNoSchedule | Try to avoid, but schedule if necessary |
| NoExecute | Evict existing pods too (unless tolerated) |
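NoExecute tolerations can be time-boxed with tolerationSeconds: the pod stays for that long after the taint appears, then is evicted. A sketch using the built-in not-ready taint that the node lifecycle controller applies automatically:

```yaml
tolerations:
  - key: node.kubernetes.io/not-ready
    operator: Exists
    effect: NoExecute
    tolerationSeconds: 300   # evicted 5 minutes after the node goes NotReady
```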
Affinity & Scheduling
Node Affinity
apiVersion: v1
kind: Pod
metadata:
  name: gpu-workload
spec:
  affinity:
    nodeAffinity:
      # Hard requirement: must match
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
          - matchExpressions:
              - key: node.kubernetes.io/instance-type
                operator: In
                values: [p3.2xlarge, p3.8xlarge]
              - key: topology.kubernetes.io/zone
                operator: In
                values: [us-east-1a, us-east-1b]
      # Soft preference: try to match
      preferredDuringSchedulingIgnoredDuringExecution:
        - weight: 80
          preference:
            matchExpressions:
              - key: gpu-memory
                operator: Gt
                values: ["16"]
  containers:
    - name: training
      image: ml-training:latest
      resources:
        limits:
          nvidia.com/gpu: 1
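The gpu-memory key in the preferred term is a custom label that has to be applied to nodes beforehand (the node name and value here are placeholders); the Gt operator compares values as integers:

```shell
# Label a node with its GPU memory in GiB
kubectl label nodes gpu-node-1 gpu-memory=24

# Verify (shows the label as a column)
kubectl get nodes -L gpu-memory
```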
Pod Anti-Affinity (Spread Replicas)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: web
spec:
  replicas: 3
  selector:
    matchLabels:
      app: web
  template:
    metadata:
      labels:
        app: web
    spec:
      affinity:
        podAntiAffinity:
          # Hard: never on same node as another web pod
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: app
                    operator: In
                    values: [web]
              topologyKey: kubernetes.io/hostname
          # Soft: try to spread across zones
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                labelSelector:
                  matchLabels:
                    app: web
                topologyKey: topology.kubernetes.io/zone
      containers:
        - name: web
          image: nginx:1.25
Pod Affinity (Co-locate with dependency)
affinity:
  podAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      - labelSelector:
          matchLabels:
            app: redis-cache
        topologyKey: kubernetes.io/hostname
# Schedules on the same node as redis-cache for low latency
Priority Classes
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: critical-workload
value: 1000000
globalDefault: false
preemptionPolicy: PreemptLowerPriority
description: "Critical production workloads"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: batch-low
value: 100
preemptionPolicy: Never  # Won't evict other pods
description: "Low-priority batch jobs"

# Usage in pod spec:
spec:
  priorityClassName: critical-workload
  containers:
    - name: app
      image: myapp:latest