We're running the official Traefik helm chart with 15 replicas, and a single pod is consistently using about 3x the memory, noticeably more CPU, and roughly double the network traffic of the other replicas. My hunch is that this is related to access logging, the Prometheus or DataDog metrics integration, or the Jaeger tracing integration. Is there a way to spread that work out horizontally so one pod isn't a single point of failure? Has anybody else seen this behavior?
Here are our helm values:
## https://github.com/traefik/traefik-helm-chart/blob/master/traefik/values.yaml
image:
  tag: "v2.10.7"
deployment:
  kind: Deployment
service:
  spec:
    externalTrafficPolicy: Local
commonLabels:
  app: traefik
topologySpreadConstraints:
  - maxSkew: 1
    labelSelector:
      matchLabels:
        app: '{{ template "traefik.name" . }}'
    topologyKey: kubernetes.io/hostname
    whenUnsatisfiable: ScheduleAnyway
ingressRoute:
  dashboard:
    enabled: true
    annotations:
      kubernetes.io/tls-acme: "true"
    middlewares:
      - name: special-ips-only
        namespace: networking
    tls:
      secretName: traffic.tls
      domains:
        - main: traffic.ourdomain.com
autoscaling:
  enabled: true
  minReplicas: 15
  maxReplicas: 999
priorityClassName: traefik-priority-class
extraObjects:
  - apiVersion: scheduling.k8s.io/v1
    kind: PriorityClass
    metadata:
      name: traefik-priority-class
    value: 1000000000
    description: High priority class for essential pods
    preemptionPolicy: PreemptLowerPriority
  - apiVersion: autoscaling.k8s.io/v1
    kind: VerticalPodAutoscaler
    metadata:
      name: traefik-deployment-vpa
      namespace: networking
    spec:
      targetRef:
        apiVersion: "apps/v1"
        kind: Deployment
        name: traefik
      updatePolicy:
        updateMode: "Off"
additionalArguments:
  - "--api.dashboard=true"
  - "--api.insecure=true"
  - "--global.sendAnonymousUsage=false"
  - "--global.checkNewVersion=false"
providers:
  kubernetesCRD:
    allowCrossNamespace: true
resources:
  limits:
    cpu: 2
    ephemeral-storage: 500Mi
    memory: 2Gi
  requests:
    cpu: 2
    ephemeral-storage: 500Mi
    memory: 2Gi
updateStrategy:
  type: RollingUpdate
  rollingUpdate:
    maxUnavailable: 0
    maxSurge: 25%
ports:
  web:
    redirectTo:
      port: websecure
  websecure:
    middlewares:
      - networking-retry@kubernetescrd
      - networking-security-headers@kubernetescrd
  metrics:
    port: 9100
    expose: false
    exposedPort: 9100
    protocol: TCP
experimental:
  plugins: {}
logs:
  access:
    enabled: true
    format: json
    fields:
      general:
        defaultMode: drop
        names:
          Overhead: keep      # How long traefik took to process
          Duration: keep      # How long the request took
          RequestLine: keep   # [Method] [Path] [Protocol]
      headers:
        defaultMode: drop     # drop all headers per default
        names:
          User-Agent: keep    # log user agent strings
## Metrics https://doc.traefik.io/traefik/observability/metrics/prometheus/
metrics:
  datadog:
    address: datadog.datadog.svc:8125
  prometheus:
    # -- Entry point used to expose metrics.
    entryPoint: metrics
    ## Enable metrics on entry points. Default=true
    addEntryPointsLabels: true
    ## Enable metrics on routers. Default=false
    addRoutersLabels: true
    ## Enable metrics on services. Default=true
    addServicesLabels: true
    ## Buckets for latency metrics. Default="0.1,0.3,1.2,5.0"
    # buckets: "0.5,1.0,2.5"
    ## When manualRouting is true, it disables the default internal router in
    ## order to allow creating a custom router for prometheus@internal service.
    # manualRouting: true
tracing:
  jaeger:
    samplingServerURL: http://jaeger-agent.monitoring.svc:5778/sampling
    localAgentHostPort: jaeger-agent.monitoring.svc:6831
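For concreteness, this is the kind of change I've been toying with on the DataDog/Jaeger side of it: have each replica push StatsD metrics and spans to the agent on its own node via the downward API, instead of funneling everything through a single ClusterIP service. Untested sketch only; it assumes the DataDog and Jaeger agents run as DaemonSets reachable on the host, and HOST_IP is just a placeholder env var name:

env:
  - name: HOST_IP
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP  # downward API: IP of the node the pod runs on
additionalArguments:
  # Kubernetes expands $(HOST_IP) in container args, so each replica would talk
  # to its own node-local agent; these would replace the metrics.datadog.address
  # and tracing.jaeger.localAgentHostPort values above.
  - "--metrics.datadog.address=$(HOST_IP):8125"
  - "--tracing.jaeger.localAgentHostPort=$(HOST_IP):6831"

Even so, I'm not sure that explains why only one replica runs hot, so any pointers are appreciated.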