Prometheus 是一套开源的系统监控报警框架。它启发于 Google 的 Borgmon 监控系统,由工作在 SoundCloud 的 Google 前员工在 2012 年创建,作为社区开源项目进行开发,并于 2015 年正式发布。2016 年,Prometheus 正式加入 Cloud Native Computing Foundation,成为受欢迎度仅次于 Kubernetes 的项目。 -- 翻译自官网
本文以操作为主, 关于prometheus的概念请参考:
启动prometheus
docker启动
# Fetch the official image, then run Prometheus detached with the web UI
# published on host port 9090 and the local config/rule files mounted in.
docker pull prom/prometheus
# Prometheus 2.x only accepts double-dash long flags, and the 1.x
# -alertmanager.url flag no longer exists: declare the Alertmanager
# address (here 10.0.2.15:9093) under `alerting.alertmanagers` in
# prometheus.yml instead.
docker run -d -p 9090:9090 \
  -v "$PWD/prometheus.yml:/etc/prometheus/prometheus.yml" \
  -v "$PWD/alert.rules:/etc/prometheus/alert.rules" \
  --name prometheus \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml
- 用docker-compose启动容器
# Compose file that runs a single Prometheus container.
version: "2"
services:
  prom:
    image: prom/prometheus
    container_name: prometheus
    volumes:
      # Mount the local scrape configuration over the one shipped in the image.
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    ports:
      # Quoted so a YAML 1.1 parser can never read host:container as a base-60 number.
      - "9090:9090"
二进制文件启动
# List every command-line flag the binary supports.
prometheus -h
# Launch the server: config file, TSDB data directory, console assets,
# and the address the web UI / API listens on.
/bin/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus \
  --web.console.libraries=/usr/share/prometheus/console_libraries \
  --web.console.templates=/usr/share/prometheus/consoles \
  --web.listen-address="0.0.0.0:9090"
配置
prometheus.yml
# Global settings; individual scrape jobs may override them.
global:
  # Interval between two scrapes of a target. Set to 15s here;
  # Prometheus falls back to 1m when this key is omitted.
  scrape_interval: 15s
  # Labels attached to every time series and alert when talking to external systems.
  external_labels:
    monitor: 'codelab-monitor'

# Files that hold the alerting rules.
rule_files:
  - '/etc/prometheus/alert.rules'

# Scrape endpoints: which targets to scrape and with which parameters.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  # Job names must be unique; this job scrapes Prometheus' own metrics.
  - job_name: 'prometheus'
    # Overrides the global scrape_interval for this job.
    scrape_interval: 5s
    # Statically configured targets.
    static_configs:
      - targets: ['172.17.0.2:9090']

  # Job names must be unique; this job scrapes host metrics and requires
  # node_exporter to be installed on that host.
  - job_name: 'node'
    scrape_interval: 10s
    static_configs:
      # Endpoint of the local node_exporter.
      - targets: ['10.0.2.15:9100']
数据采集
golang
github.com/prometheus/client_golang/prometheus、github.com/mcuadros/go-gin-prometheus
import (
"github.com/gin-gonic/gin"
gp "github.com/mcuadros/go-gin-prometheus"
)
// Build the default gin engine, then attach the Prometheus middleware
// (created under the "gin" subsystem name) to it.
router := gin.Default()
ginMetrics := gp.NewPrometheus("gin")
ginMetrics.Use(router)
- 自定义监控
// Suppose etcd needs continuous health monitoring.
// EtcdUp is a gauge metric that is set to 1 while etcd is alive and 0 otherwise.
// NOTE(review): a collector is only exported once it is registered
// (e.g. prometheus.MustRegister(EtcdUp)) — confirm that happens elsewhere.
var EtcdUp = prometheus.NewGauge(prometheus.GaugeOpts{
	Name: "etcd_up",
	Help: "Etcd cluster alive check.",
})

// Start a goroutine dedicated to polling the etcd state.
go func() {
	// Loop forever: without the loop the check would run exactly once and
	// the goroutine would exit after a single sleep, leaving the gauge stale.
	for {
		// etcdAlive reports whether etcd is currently healthy.
		if etcdAlive() {
			EtcdUp.Set(1)
		} else {
			EtcdUp.Set(0)
		}
		// Wait 3 seconds between two checks.
		time.Sleep(3 * time.Second)
	}
}()
python
pip install prometheus_client
import time

from prometheus_client import Counter, Histogram

# Request latency and request count; both can be filtered by URL and method.
FLASK_REQUEST_LATENCY = Histogram(
    'flask_request_latency_seconds', 'Flask Request Latency',
    ['method', 'endpoint'])
FLASK_REQUEST_COUNT = Counter(
    'flask_request_count', 'Flask Request Count',
    ['method', 'endpoint', 'http_status'])


# Flask's before/after request hooks are used to instrument every request.
@app.before_request
def before_request():
    """Record the moment the request started on the request object."""
    request.start_time = time.time()


@app.after_request
def after_request(response):
    """Report latency and count for the finished request and return the response unchanged."""
    # Elapsed wall-clock time since before_request ran.
    request_latency = time.time() - request.start_time
    # Report the latency labelled by method and URL path.
    # NOTE(review): request.path is the raw path, so parameterised routes
    # produce one label value per distinct URL — confirm the cardinality is acceptable.
    FLASK_REQUEST_LATENCY.labels(request.method, request.path).observe(request_latency)
    # Count the request labelled by method, URL path and status code.
    FLASK_REQUEST_COUNT.labels(request.method, request.path, response.status_code).inc()
    return response
# To monitor memory usage, the third-party psutil library is recommended: "pip install psutil"
import os  # needed for os.getpid() below

import psutil

# Resident set size of the current process, in bytes.
mem_used = psutil.Process(os.getpid()).memory_info().rss
# 数据库存活的监控可以起一个线程用空查询的方式轮询
# DIY实现
# Expose the Prometheus route through WSGI middleware mounted next to the Flask app.
try:
    # Location since Werkzeug 0.15; the only one available from Werkzeug 1.0 on.
    from werkzeug.middleware.dispatcher import DispatcherMiddleware
except ImportError:
    # Fallback for older Werkzeug releases.
    from werkzeug.wsgi import DispatcherMiddleware
from prometheus_client import make_wsgi_app

app = Flask(__name__)
# Requests under /metrics go to the Prometheus WSGI app, everything else to Flask.
dispatch = DispatcherMiddleware(app, {"/metrics": make_wsgi_app()})
# Start the service with gunicorn:
# gunicorn -k gevent -w 1 main:dispatch -b 0.0.0.0:5000
- 使用gunicorn多进程的时候会导致数据冲突, 建议强制worker为1
数据查询
官方文档:
# 单条记录查询, 不带time参数默认取最近一条数据
$ curl 'http://localhost:9090/api/v1/query?query=up&time=2015-07-01T20:10:51.781Z'
{
"status" : "success",
"data" : {
"resultType" : "vector",
"result" : [
{
"metric" : {
"__name__" : "up",
"job" : "prometheus",
"instance" : "localhost:9090"
},
"value": [ 1435781451.781, "1" ]
},
{
"metric" : {
"__name__" : "up",
"job" : "node",
"instance" : "localhost:9100"
},
"value" : [ 1435781451.781, "0" ]
}
]
}
}
# 范围查询
$ curl 'http://localhost:9090/api/v1/query_range?query=up&start=2015-07-01T20:10:30.781Z&end=2015-07-01T20:11:00.781Z&step=15s'
{
"status" : "success",
"data" : {
"resultType" : "matrix",
"result" : [
{
"metric" : {
"__name__" : "up",
"job" : "prometheus",
"instance" : "localhost:9090"
},
"values" : [
[ 1435781430.781, "1" ],
[ 1435781445.781, "1" ],
[ 1435781460.781, "1" ]
]
},
{
"metric" : {
"__name__" : "up",
"job" : "node",
"instance" : "localhost:9091"
},
"values" : [
[ 1435781430.781, "0" ],
[ 1435781445.781, "0" ],
[ 1435781460.781, "1" ]
]
}
]
}
}
引用
<prometheus实战>:
吴 莉, 殷一鸣, 蔡林 <prometheus入门与实践>:
官方文档: