一、prometheus搭建
1.配置文件构成
全局、报警、规则、抓取
Prometheus 的配置文件(prometheus.yml)就四大金刚:
global
全局默认参数:多久抓一次、多久算一次报警、对外的“身份证”标签。
alerting
报警出口:算出火警后往哪台 Alertmanager 送;没有就空着。
rule_files
报警规则/记录规则文件列表,Prometheus 启动时会把它们加载进来。
scrape_configs
抓取任务列表:告诉 Prometheus 去哪些地址、多久一次、怎么贴标签、怎样服务发现。
2.配置文件模板
# 全局
global:
  scrape_interval: 5s
  evaluation_interval: 5s
  external_labels:
    monitor: 'dashboard'
# 告警出口
#alerting:
#  alertmanagers:
#    - static_configs:
#        - targets:
#            - "10.xx.xx.xx:9093"
# 告警规则
rule_files:
  - '/etc/prometheus/rules/*.yml'
# 抓取指标
scrape_configs:
  #示例
  - job_name: 'prometheus/pushgateway'
    scrape_interval: 15s
    static_configs:
      - targets: ['prometheus:9090']
  #服务器监控
  - job_name: 'node_exporter'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.xx.xx.xx:xxxx']
  #容器监控
  - job_name: 'cadvisor'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.xx.xx.xx:xxxx']
  #后端服务
  - job_name: 'qc6_metrics'
    metrics_path: '/metrics/qc_server'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.233.63.6:9990']
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: "qc6_仿真"
3.docker-compose文件示例
(注:以下示例实际为 Grafana 的 compose 配置,与“三、grafana的搭建”中的示例相同;Prometheus 自身的 compose 文件待补充。)
version: "3"
services:
  grafana:
    image: grafana/grafana:10.4.19
    #image: grafana/grafana:12.0.0
    container_name: grafana
    user: root
    restart: always
    ports:
      - "13000:3000"
    volumes:
      - /data/grafana/conf:/etc/grafana
      - /data/grafana/data:/var/lib/grafana
      - /nfsdata/prod_env/app_log/middle/grafana:/var/log/grafana
      #- ./log:/var/log/grafana
    #networks:
      #- lightning-network
二、监控采集容器搭建
1.node-exporter搭建
docker-compose 文件示例
version: '3'
services:
  node_exporter:
    image: prom/node-exporter:latest
    container_name: node_exporter
    command:
      - '--path.rootfs=/host'
    pid: host
    restart: always
    environment:
      - TZ=Asia/Shanghai
    ports:
      - "9100:9100"
    volumes:
      - '/:/host:ro,rslave'
      #“把整个宿主机根目录挂到容器里的 /host 路径,只读(ro)方式,并且用rslave保证容器内能看到宿主机后续挂载点变化,让 node_exporter 能读到真正的 /proc、/sys 等系统文件。”
2.cadvisor搭建
docker-compose文件示例
version: '3.2'
services:
  cadvisor:
    image: harbor:443/cmamoc/cadvisor:v0.37.0
    container_name: cadvisor
    ports:
      - 8080:8080
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /data/docker/:/var/lib/docker:ro
三、grafana的搭建
1.docker-compose文件示例
version: "3"
services:
  grafana:
    image: grafana/grafana:10.4.19
    #image: grafana/grafana:12.0.0
    container_name: grafana
    user: root
    restart: always
    ports:
      - "13000:3000"
    volumes:
      - /data/grafana/conf:/etc/grafana
      - /data/grafana/data:/var/lib/grafana
      - /nfsdata/prod_env/app_log/middle/grafana:/var/log/grafana
      #- ./log:/var/log/grafana
    #networks:
      #- lightning-network
2.配置文件
1.控制 Grafana 服务器的基本行为
[server]
# 协议:http 或 https
protocol = http
# HTTP 端口
http_port = 3000
# 绑定地址(0.0.0.0 表示所有接口)
http_addr = 0.0.0.0
# 对外访问的域名(用于 root_url 与 OAuth 回调)
domain = localhost
# 公共访问地址(必须设置,用于重定向和链接生成)
root_url = http://localhost:3000
# 是否从子路径提供服务(用于反向代理)
serve_from_sub_path = false
# 启用 GZIP 压缩
enable_gzip = false
# 静态文件路径
static_root_path = public
# SSL/TLS 配置
cert_file = /path/to/cert.pem
cert_key = /path/to/key.pem
2.数据库配置 [database]
[database]
# 数据库类型:sqlite3, mysql, postgres
type = sqlite3
# SQLite 数据库文件路径
path = grafana.db
# MySQL/PostgreSQL 连接
host = 127.0.0.1:3306
name = grafana
user = grafana
password = secret
# 连接池设置
max_idle_conn = 2
max_open_conn = 0
conn_max_lifetime = 14400
# 日志 SQL 查询
log_queries = false
3.安全配置 [security]
[security]
# 管理员初始密码
admin_password = admin
# 加密密钥(必须更改!)
secret_key = SW2YcwTIb9zpOOhoPsMm
# Cookie 安全设置
# 是否仅通过 HTTPS 发送 Cookie
cookie_secure = false
# SameSite 策略:lax/strict/none
cookie_samesite = lax
# 记住登录天数
login_remember_days = 7
# 初始管理员创建与登录暴力破解防护
disable_initial_admin_creation = false
disable_brute_force_login_protection = false
# 允许的用户名正则
allowed_username = ^[A-Za-z0-9_-]+$
4.用户配置 [users]
[users]
# 允许用户注册
allow_sign_up = false
# 自动分配组织和角色
auto_assign_org = true
auto_assign_org_role = Viewer
# 查看者是否可以编辑
viewers_can_edit = false
# 允许组织创建
allow_org_create = false
# 默认主题:dark, light, system
default_theme = dark
# 登录提示
login_hint = email or username
# 外部用户管理
external_manage_link_url =
external_manage_link_name =
external_manage_info =
5.认证配置 [auth]
[auth]
# 禁用登录表单(当使用 OAuth/LDAP 时)
disable_login_form = false
# 禁用登出菜单
disable_signout_menu = false
# OAuth state Cookie 的有效期(秒)
oauth_state_cookie_max_age = 600
# API 密钥最长有效期(秒)
api_key_max_seconds_to_live = 86400
# 匿名访问配置
[auth.anonymous]
enabled = false
org_name = Main Org.
org_role = Viewer
# 基本认证
[auth.basic]
enabled = true
6.日志配置 [log]
[log]
# 日志模式:console, file, syslog
mode = console file
# 日志级别:debug, info, warn, error, critical
level = info
# 日志文件设置
# 过滤器
filters =
# 每行最大字符数
log_line_limit = 1000
# 日志文件名
log_file_name = grafana.log
# 控制台输出格式:console/json
console_format = console
7.仪表板配置 [dashboards]
[dashboards]
# 保留的历史版本数量
versions_to_keep = 20
# 最小刷新间隔
min_refresh_interval = 5s
# 默认首页仪表板的 JSON 文件路径
default_home_dashboard_path =
# 不允许的时间范围
timezone_options =
8.警报配置 [alerting]
[alerting]
# 告警启用状态
enabled = true
execute_alerts = true
# 告警超时设置
notification_timeout_seconds = 30
max_attempts = 3
# 告警评估超时
evaluation_timeout_seconds = 30
# 错误/超时配置
error_or_timeout = alerting
nodata_or_nullvalues = no_data
# 告警通知图片渲染的并发上限
concurrent_render_limit = 5