Hi, I'm currently running several services in a Docker Swarm encompassing 3 Ubuntu servers. Traefik runs on all three nodes, but the main node, let's call it node1, is getting sent all the initial traffic.
I'm running a front-end(FE) project with several back-end (BE) APIs serving it.
Sometimes I get 504 from the APIs when requests timeout, but it is not reproducible at will. Sometimes the APIs are also very slow to respond and I can see in my Chrome DevTools that there is significant time spent waiting on the server to respond after the FE has sent the request. The problem is not on the application side as I have logging which shows that none of the end-points spends more than 0.5sec to retrieve and send back the results. The server-side logs are not giving me any indication of what is wrong on both the traefik and application sides.
Here is my setup for traefik and the applications:
Traefik config:
version: "3.9"
networks:
traefik-swarm:
external: true
proxy:
external: true
services:
traefik:
image: "traefik:v2.6"
networks:
- traefik-swarm
- proxy
command:
#Providers
- "--providers.docker.endpoint=tcp://socket-proxy:2375"
- "--providers.docker=true"
- "--providers.docker.swarmMode=true"
- "--providers.docker.swarmModeRefreshSeconds=1"
- "--providers.docker.exposedByDefault=false"
- "--providers.docker.network=traefik-swarm"
- "--providers.docker.useBindPortIP=true"
#Logging
- "--log.level=INFO"
- "--log.filePath=$PWD/logs/traefik.log"
- "--log.format=json"
#Access logs
- "--accesslog.filePath=$PWD/logs/access.log"
- "--accesslog.format=json"
- "--accesslog.fields.defaultmode=keep"
- "--accesslog.fields.names.ClientUsername=drop"
- "--accesslog.fields.headers.defaultmode=keep"
- "--accesslog.fields.headers.names.User-Agent=redact"
- "--accesslog.fields.headers.names.Authorization=drop"
- "--accesslog.fields.headers.names.Content-Type=keep"
- "--accessLog.filters.statusCodes=204-299,400-499,500-599"
#Entry points
- "--entryPoints.web.address=:80"
- "--entryPoints.websecure.address=:443"
#Certs
- "--certificatesresolvers.le.acme.httpchallenge=true"
- "--certificatesresolvers.le.acme.httpchallenge.entrypoint=web"
- "--certificatesresolvers.le.acme.email=email@domain.com"
- "--certificatesresolvers.le.acme.storage=/letsencrypt/acme.json"
# - "--certificatesresolvers.le.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory"
- "--certificatesResolvers.le.acme.caServer=https://acme-v02.api.letsencrypt.org/directory"
ports:
- target: 80
published: 80
protocol: tcp
mode: host
- target: 443
published: 443
protocol: tcp
mode: host
deploy:
mode: global
update_config:
parallelism: 1
delay: 10s
order: start-first
failure_action: rollback
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
labels:
- "traefik.enable=true"
- "traefik.http.routers.traefik.rule=Host(`traefik.services.domain.com`)"
- "traefik.http.routers.traefik.tls.certresolver=le"
- "traefik.http.routers.traefik.tls=true"
- "traefik.http.routers.traefik.entrypoints=websecure"
# Global redirect to https
- "traefik.http.routers.http-catchall.rule=hostregexp(`{host:.+}`)"
- "traefik.http.routers.http-catchall.entrypoints=web"
- "traefik.http.routers.http-catchall.middlewares=redirect-to-https"
# Middleware redirect
- "traefik.http.middlewares.redirect-to-https.redirectscheme.scheme=https"
- "traefik.http.middlewares.redirect-to-https.redirectscheme.permanent=true"
# CORS Headers
- "traefik.http.middlewares.cors.headers.accesscontrolalloworiginlist=*"
- "traefik.http.middlewares.cors.headers.accesscontrolmaxage=100"
- "traefik.http.middlewares.cors.headers.accesscontrolallowmethods=GET,OPTIONS,PUT,DELETE,POST,PATCH"
- "traefik.http.middlewares.cors.headers.addvaryheader=true"
- "traefik.http.services.traefik.loadbalancer.server.port=80"
volumes:
- "./letsencrypt:/letsencrypt"
- "/var/run/docker.sock:/var/run/docker.sock:ro"
- "/mnt/nfs_share/logs/:$PWD/logs/"
socket-proxy:
image: tecnativa/docker-socket-proxy
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
environment:
ALLOW_RESTARTS: 1
AUTH: 1
BUILD: 1
COMMIT: 1
CONFIGS: 1
CONTAINERS: 1
DISTRIBUTION: 1
EVENTS: 1
EXEC: 1
IMAGES: 1
INFO: 1
LOG_LEVEL: "info"
NETWORKS: 1
NODES: 1
PING: 1
PLUGINS: 1
POST: 1
DELETE: 1
SECRETS: 1
SERVICES: 1
SESSION: 1
SWARM: 1
SYSTEM: 1
TASKS: 1
VERSION: 1
VOLUMES: 1
CONTAINERS_CREATE: 1
CONTAINERS_PRUNE: 1
CONTAINERS_RESIZE: 1
CONTAINERS_START: 1
CONTAINERS_UPDATE: 1
CONTAINERS_RENAME: 1
CONTAINERS_PAUSE: 1
CONTAINERS_UNPAUSE: 1
CONTAINERS_ATTACH: 1
CONTAINERS_WAIT: 1
CONTAINERS_EXEC: 1
VOLUMES_CREATE: 1
VOLUMES_PRUNE: 1
NETWORKS_CREATE: 1
NETWORKS_PRUNE: 1
NETWORKS_CONNECT: 1
NETWORKS_DISCONNECT: 1
NETWORKS_DELETE: 1
CONTAINERS_DELETE: 1
IMAGES_DELETE: 1
VOLUMES_DELETE: 1
networks:
- proxy
deploy:
mode: replicated
replicas: 1
update_config:
parallelism: 1
delay: 10s
order: start-first
failure_action: rollback
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
BE service
version: "3.9"
networks:
traefik-swarm:
external: true
proxy:
external: true
services:
service-name:
image: image
networks:
- traefik-swarm
deploy:
mode: replicated
replicas: 3
update_config:
parallelism: 1
delay: 10s
order: start-first
failure_action: rollback
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
labels:
- "traefik.enable=true"
- "traefik.docker.network=traefik-swarm"
- "traefik.http.routers.service-name.service=service-name"
- "traefik.http.routers.service-name.entrypoints=websecure"
- "traefik.http.routers.service-name.rule=Host(`api.service-name.services.domain.com`)"
- "traefik.http.routers.service-name.tls=true"
- "traefik.http.routers.service-name.tls.certresolver=le"
- "traefik.http.services.service-name.loadbalancer.server.port=80"
ports:
- "80"
secrets:
- secret
secrets:
secret:
external: true
FE service
version: "3.9"
networks:
traefik-swarm:
external: true
proxy:
external: true
services:
service-name-fe:
image: image
networks:
- traefik-swarm
deploy:
mode: replicated
replicas: 3
update_config:
parallelism: 1
delay: 10s
order: start-first
failure_action: rollback
placement:
constraints:
- node.role == manager
restart_policy:
condition: on-failure
delay: 5s
max_attempts: 3
window: 120s
labels:
- "traefik.enable=true"
- "traefik.docker.network=traefik-swarm"
- "traefik.http.routers.service-name-fe.service=service-name-fe"
- "traefik.http.routers.service-name-fe.entrypoints=websecure"
- "traefik.http.routers.service-name-fe.rule=Host(`service-name.domain.com`)"
- "traefik.http.routers.service-name-fe.tls=true"
- "traefik.http.routers.service-name-fe.tls.certresolver=le"
- "traefik.http.services.service-name-fe.loadbalancer.server.port=3000"
ports:
- "3000"
secrets:
- secret
secrets:
secret:
external: true
Otherwise, it's running great but as the number of services and the traffic has gone up I have been experiencing some issues. Any feedback would be appriciated, as I've been debugging for a while now!
Anton