prometheus 服务部署

本帖最后更新于 1440 天前,其中的信息可能已经天翻地覆

  • 主服务 prometheus:挂载了配置文件 prometheus.yml(详细介绍在 prometheus 配置文件)、告警规则目录 ./rules 下的规则文件、监控服务地址目录 ./targets 下的目标文件
  • 搭配的告警服务 alertmanager:挂载了配置文件 alertmanager.yml(详细介绍在 Alertmanager)
  • 配合告警服务进行钉钉告警的 dingtalk:挂载了配置文件 ding.yml(详细介绍在钉钉告警)、自定义告警模板 template.tmpl
  • 服务监控中转 pushgateway
  • prometheus 远程存储服务器 influxdb,用于数据持久化

docker-compose 文件

# Monitoring stack: Prometheus, Alertmanager, the DingTalk webhook bridge,
# Pushgateway, and InfluxDB 1.8 (remote storage for Prometheus).
version: '3.6'
services:
  # Prometheus server. The main config, the rule files and the file_sd target
  # lists are bind-mounted one file at a time from the compose directory.
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    environment:
      TZ: 'Asia/Shanghai'
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./rules/alert.rules:/prometheus/alert.rules
      - ./rules/containerAlert.rules:/prometheus/containerAlert.rules
      - ./rules/jvm.rules:/prometheus/jvm.rules
      - ./targets/convergence_targets.json:/prometheus/convergence_targets.json
      - ./targets/ac_targets.json:/prometheus/ac_targets.json
      - ./targets/other_targets.json:/prometheus/other_targets.json
      - ./targets/ex_targets.json:/prometheus/ex_targets.json
      # NOTE(review): no scrape job in the reference prometheus.yml reads this
      # file -- confirm whether a job is missing or this mount is obsolete.
      - ./targets/ng_grouptask_targets.json:/prometheus/ng_grouptask_targets.json
    ports:
      - '19090:9090'

  # Alertmanager; prometheus.yml addresses it as alertmanager:9093.
  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    environment:
      TZ: 'Asia/Shanghai'
    volumes:
      - /etc/localtime:/etc/localtime
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    restart: always
    ports:
      - '19093:9093'

  # DingTalk webhook bridge; alertmanager.yml posts to it as dingtalk:8060,
  # i.e. via the container_name below rather than the service name.
  prometheusdingtalk:
    image: timonwong/prometheus-webhook-dingtalk
    container_name: dingtalk
    environment:
      TZ: 'Asia/Shanghai'
    restart: always
    volumes:
      - /etc/localtime:/etc/localtime
      - ./ding.yml:/etc/prometheus-webhook-dingtalk/config.yml
      - ./template.tmpl:/etc/prometheus-webhook-dingtalk/templates/default.tmpl
    ports:
      - '18060:8060'

  # Pushgateway; prometheus.yml scrapes it as pg:9091 (the container_name).
  prometheusgateway:
    image: prom/pushgateway
    container_name: pg
    environment:
      TZ: 'Asia/Shanghai'
    volumes:
      - /etc/localtime:/etc/localtime
    restart: always
    ports:
      - '9091:9091'

  # InfluxDB 1.8, target of Prometheus remote_write / remote_read.
  influxdb:
    image: influxdb:1.8
    container_name: influxdb
    ports:
      - '18086:8086'
    restart: always
    volumes:
      - /prometheus/influxdb/conf:/etc/influxdb
      - /data/influxdb/data:/var/lib/influxdb/data
      - /data/influxdb/meta:/var/lib/influxdb/meta
      # Must match wal-dir in the InfluxDB config (/var/lib/influxdb/wal);
      # any other container path leaves the WAL outside the host volume.
      - /data/influxdb/wal:/var/lib/influxdb/wal
    environment:
      TZ: 'Asia/Shanghai'

启动命令

docker-compose up -d

查看容器状态

docker ps -a

配置文件参考

prometheus.yml

# Prometheus server configuration: global intervals, Alertmanager endpoint,
# rule files, scrape jobs, and InfluxDB remote storage.
global:
  scrape_interval: 15s  # Scrape targets every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. Default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'.
rule_files:
  - /prometheus/alert.rules
  - /prometheus/containerAlert.rules
  - /prometheus/jvm.rules

# Scrape jobs: the Pushgateway (static target) plus several jobs whose targets
# are discovered from JSON files via file_sd.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries
  # scraped from this config.
  - job_name: 'pushgateway'
    # NOTE(review): Pushgateway jobs are commonly scraped with
    # `honor_labels: true`; left unset here to keep existing label names.
    static_configs:
      - targets: ['pg:9091']
        labels:
          instance: pushgateway
  - job_name: 'convergency_sd'
    scrape_interval: 3s
    metrics_path: '/actuator/prometheus'
    file_sd_configs:
      - refresh_interval: 1m
        files: ['/prometheus/convergence_targets.json']
  - job_name: 'ac_sd'
    file_sd_configs:
      - refresh_interval: 1m
        files: ['/prometheus/ac_targets.json']
  - job_name: 'other_sd'
    file_sd_configs:
      - files: ['/prometheus/other_targets.json']
  - job_name: 'ex_sd'
    file_sd_configs:
      - files: ['/prometheus/ex_targets.json']

# InfluxDB 1.x remote storage. Read and write must use the same database and
# credentials.
# NOTE(review): credentials are embedded in the URLs in plain text (admin/admin);
# replace them with real, non-default credentials before production use.
remote_write:
  - url: "http://influxdb:8086/api/v1/prom/write?db=prometheus&u=admin&p=admin"
remote_read:
  - url: "http://influxdb:8086/api/v1/prom/read?db=prometheus&u=admin&p=admin"

alertmanager.yml

# Alertmanager configuration: the root route sends to the `webhook` receiver;
# child routes select a receiver by the `level` or `checkType` alert label.
global:
  resolve_timeout: 5m

route:
  receiver: webhook
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 5m
  routes:
    # P0 alerts: no repeat_interval override on this route.
    - match_re:
        level: P0
      receiver: webhook
    # JVM alerts go to dedicated receivers, repeated every 20 minutes.
    - match:
        checkType: jvm
      receiver: webhookJvm
      repeat_interval: 20m
    - match:
        checkType: jvmNum
      receiver: webhookJvmNum
      repeat_interval: 20m
    # P3 and P2 alerts are repeated every 20 minutes.
    - match_re:
        level: P3
      receiver: webhook
      repeat_interval: 20m
    - match_re:
        level: P2
      receiver: webhook
      repeat_interval: 20m
    # P1 alerts: no repeat_interval override on this route.
    - match_re:
        level: P1
      receiver: webhook

# Each receiver posts to one target of the DingTalk webhook bridge.
receivers:
  - name: webhook
    webhook_configs:
      - url: http://dingtalk:8060/dingtalk/webhook2/send
        send_resolved: true
  # Not referenced by any route above.
  - name: webhookTest
    webhook_configs:
      - url: http://dingtalk:8060/dingtalk/webhook3/send
        send_resolved: false
  - name: webhookJvm
    webhook_configs:
      - url: http://dingtalk:8060/dingtalk/webhook5/send
        send_resolved: false
  - name: webhookJvmNum
    webhook_configs:
      - url: http://dingtalk:8060/dingtalk/webhook6/send
        send_resolved: false

inhibit_rules:
  # Source alerts: the alerts that cause others to be inhibited.
  - source_match:
      severity: 'critical'
    # Target alerts: the alerts that get inhibited.
    target_match:
      severity: 'warning'
    # Source and target must carry the same value for these labels,
    # otherwise the inhibition does not apply.
    equal:
      - 'checkType'

ding.yml

## prometheus-webhook-dingtalk configuration: template path plus one target
## per DingTalk robot / message template combination.
##
## SECURITY: robot access tokens are secrets. They are shown as placeholders
## here; put the real token only in the deployed copy of this file and never
## publish or commit it.

## Request timeout
# timeout: 5s

## Customizable templates path
templates:
  - /etc/prometheus-webhook-dingtalk/templates/default.tmpl

## You can also override default template using `default_message`
## The following example to use the 'legacy' template from v0.3.0
# default_message:
#   title: '{{ template "legacy.title" . }}'
#   text: '{{ template "legacy.content" . }}'

## Targets, previously was known as "profiles"
## webhook2, webhook5 and webhook6 use the same robot (same access token) and
## differ only in the message template.
targets:
  webhook2:
    # Fenxiang server alert group
    url: 'https://oapi.dingtalk.com/robot/send?access_token=REPLACE_WITH_ALERT_GROUP_TOKEN'
    message:
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook4:
    url: 'https://oapi.dingtalk.com/robot/send?access_token=REPLACE_WITH_WEBHOOK4_TOKEN'
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'
  webhook5:
    # JVM template
    url: 'https://oapi.dingtalk.com/robot/send?access_token=REPLACE_WITH_ALERT_GROUP_TOKEN'
    message:
      title: '{{ template "jvm.title" . }}'
      text: '{{ template "jvm.content" . }}'
  webhook6:
    # JVM template with unit conversion
    url: 'https://oapi.dingtalk.com/robot/send?access_token=REPLACE_WITH_ALERT_GROUP_TOKEN'
    message:
      title: '{{ template "jvmNum.title" . }}'
      text: '{{ template "jvmNum.content" . }}'
  webhook_legacy:
    url: 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxxxxxxxx'
    # Customize template content
    message:
      # Use legacy template
      title: '{{ template "legacy.title" . }}'
      text: '{{ template "legacy.content" . }}'

influxdb.conf(InfluxDB 配置文件)

# InfluxDB storage locations. These are the container-side paths that the
# compose file's influxdb volumes are expected to back.

# Metadata store.
[meta]
dir = "/var/lib/influxdb/meta"

# Time-series data (TSM1 engine) and its write-ahead log.
[data]
dir = "/var/lib/influxdb/data"
engine = "tsm1"
wal-dir = "/var/lib/influxdb/wal"

相关帖子

欢迎来到这里!

我们正在构建一个小众社区,大家在这里相互信任,以平等 • 自由 • 奔放的价值观进行分享交流。最终,希望大家能够找到与自己志同道合的伙伴,共同成长。

注册 关于
请输入回帖内容 ...