Single pod in Kubernetes cluster has much higher memory, CPU, and network usage than the rest of the replicas

We are using the official Helm chart with 15 replicas, and a single pod is using 3x the memory, noticeably more CPU, and double the network traffic of the rest of the replicas. I have a feeling this could be related to access logging, the Prometheus metrics, the Datadog metrics integration, or the Jaeger tracing integration. Is there a way to scale this work out horizontally, so we don't have a single point of failure? Has anybody else witnessed this behavior?
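
For context, the imbalance is easy to see in per-pod usage. A minimal check, assuming metrics-server is installed and that the chart is deployed in the same networking namespace referenced in the values below (pods selected via the app: traefik label from commonLabels):

kubectl top pods -n networking -l app=traefik --sort-by=memory

One pod consistently sits well above the others in both the memory and CPU columns.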

Here are our Helm values:

## https://github.com/traefik/traefik-helm-chart/blob/master/traefik/values.yaml
image:
  tag: "v2.10.7"

deployment:
  kind: Deployment

service:
  spec:
    externalTrafficPolicy: Local

commonLabels:
  app: traefik

topologySpreadConstraints:
  - maxSkew: 1
    labelSelector:
      matchLabels:
        app: '{{ template "traefik.name" . }}'
    topologyKey: kubernetes.io/hostname
    whenUnsatisfiable: ScheduleAnyway

ingressRoute:
  dashboard:
    enabled: true
    annotations:
      kubernetes.io/tls-acme: "true"
    middlewares:
      - name: special-ips-only
        namespace: networking
    tls:
      secretName: traffic.tls
      domains:
        - main: traffic.ourdomain.com

autoscaling:
  enabled: true
  minReplicas: 15
  maxReplicas: 999

priorityClassName: traefik-priority-class

extraObjects:
  - apiVersion: scheduling.k8s.io/v1
    kind: PriorityClass
    metadata:
      name: traefik-priority-class
    value: 1000000000
    description: High priority class for essential pods
    preemptionPolicy: PreemptLowerPriority
  - apiVersion: autoscaling.k8s.io/v1
    kind: VerticalPodAutoscaler
    metadata:
      name: traefik-deployment-vpa
      namespace: networking
    spec:
      targetRef:
        apiVersion: "apps/v1"
        kind: Deployment
        name: traefik
      updatePolicy:
        updateMode: "Off"

additionalArguments:
  - "--api.dashboard=true"
  - "--api.insecure=true"
  - "--global.sendAnonymousUsage=false"
  - "--global.checkNewVersion=false"

providers:
  kubernetesCRD:
    allowCrossNamespace: true

resources:
  limits:
    cpu: 2
    ephemeral-storage: 500Mi
    memory: 2Gi
  requests:
    cpu: 2
    ephemeral-storage: 500Mi
    memory: 2Gi

updateStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxUnavailable: 0
    maxSurge: 25%

ports:
  web:
    redirectTo:
      port: websecure
  websecure:
    middlewares:
      - networking-retry@kubernetescrd
      - networking-security-headers@kubernetescrd
  metrics:
    port: 9100
    expose: false
    exposedPort: 9100
    protocol: TCP

experimental:
  plugins: {}

logs:
  access:
    enabled: true
    format: json
    fields:
      general:
        defaultMode: drop
        names:
          Overhead: keep # How long traefik took to process
          Duration: keep # How long the request took
          RequestLine: keep # [Method] [Path] [Protocol]
      headers:
        defaultMode: drop # drop all headers per default
        names:
          User-Agent: keep # log user agent strings

## Metrics https://doc.traefik.io/traefik/observability/metrics/prometheus/
metrics:
  datadog:
    address: datadog.datadog.svc:8125
  prometheus:
    # -- Entry point used to expose metrics.
    entryPoint: metrics
    ## Enable metrics on entry points. Default=true
    addEntryPointsLabels: true
    ## Enable metrics on routers. Default=false
    addRoutersLabels: true
    ## Enable metrics on services. Default=true
    addServicesLabels: true
    ## Buckets for latency metrics. Default="0.1,0.3,1.2,5.0"
    # buckets: "0.5,1.0,2.5"
    ## When manualRouting is true, it disables the default internal router in
    ## order to allow creating a custom router for prometheus@internal service.
    # manualRouting: true

tracing:
  jaeger:
    samplingServerURL: http://jaeger-agent.monitoring.svc:5778/sampling
    localAgentHostPort: jaeger-agent.monitoring.svc:6831