# Range vector: every sample recorded during the last 5 minutes
http_requests_total[5m]

# Offset modifier: the instant vector as it was 5 minutes ago
http_requests_total offset 5m

# Range with offset: the 5-minute window that ended 1 hour ago
http_requests_total[5m] offset 1h
# CPU usage per pod (cores).
# The container!="" matcher skips series that carry no container label, so
# only per-container series are summed.
sum by (namespace, pod) (
  rate(container_cpu_usage_seconds_total{container!=""}[5m])
)

# CPU usage as a percentage of requested CPU (100 = using exactly the request).
# Both sides are grouped by namespace AND pod: pod names are only unique within
# a namespace, so grouping by pod alone can merge unrelated pods.
100 *
  sum by (namespace, pod) (
    rate(container_cpu_usage_seconds_total{container!=""}[5m])
  )
/
  sum by (namespace, pod) (
    kube_pod_container_resource_requests{resource="cpu", container!=""}
  )

# Top 10 pods by CPU usage
topk(10,
  sum by (namespace, pod) (
    rate(container_cpu_usage_seconds_total{container!=""}[5m])
  )
)
# Memory working set per pod (bytes)
sum by (namespace, pod) (
  container_memory_working_set_bytes{container!=""}
)

# Memory usage in MiB (bytes / 1024 / 1024)
sum by (namespace, pod) (
  container_memory_working_set_bytes{container!=""}
) / 1024 / 1024

# Memory usage as a percentage of requested memory (100 = using exactly the request).
# Both sides are grouped by namespace AND pod: pod names are only unique within
# a namespace, so grouping by pod alone can merge unrelated pods.
100 *
  sum by (namespace, pod) (
    container_memory_working_set_bytes{container!=""}
  )
/
  sum by (namespace, pod) (
    kube_pod_container_resource_requests{resource="memory", container!=""}
  )

# Top 10 pods by memory usage
topk(10,
  sum by (namespace, pod) (
    container_memory_working_set_bytes{container!=""}
  )
)
# Number of running pods per namespace.
# kube_pod_status_phase is a 0/1 gauge with one series per pod and phase, so
# the values are summed; count() would also count the 0-valued series and
# report every pod in the namespace.
sum by (namespace) (
  kube_pod_status_phase{phase="Running"}
)

# Pods not in Running or Succeeded state.
# "> 0" keeps only the phase a pod is actually in (the series whose value is 1).
sum by (namespace, pod, phase) (
  kube_pod_status_phase{phase!~"Running|Succeeded"}
) > 0

# Containers that restarted at least once in the last hour
increase(kube_pod_container_status_restarts_total[1h]) > 0

# Containers whose last termination reason was OOMKilled.
# "== 1" guards against exporter versions that also emit 0-valued series for
# reasons that do not apply.
kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
# Node CPU utilization (%): 100 minus the average idle share across the node's CPUs.
# Grouped by "instance", the target label Prometheus attaches to every scraped
# series. node_exporter itself does not emit a "node" label; if your relabeling
# rules add one, substitute it here.
100 - (
  avg by (instance) (
    rate(node_cpu_seconds_total{mode="idle"}[5m])
  ) * 100
)

# CPU usage per node (cores busy in any non-idle mode)
sum by (instance) (
  rate(node_cpu_seconds_total{mode!="idle"}[5m])
)
# Disk usage per node filesystem (%), ignoring tmpfs and overlay mounts
100 - (
  node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
  / node_filesystem_size_bytes
  * 100
)

# Disk I/O: read throughput (bytes per second)
rate(node_disk_read_bytes_total[5m])

# Disk I/O: write throughput (bytes per second)
rate(node_disk_written_bytes_total[5m])
# Total request rate (requests per second)
sum(rate(http_requests_total[5m]))

# Request rate by status code
sum by (status_code) (
  rate(http_requests_total[5m])
)

# Error rate (5xx) as a ratio of all requests (0.05 = 5%)
sum(rate(http_requests_total{status_code=~"5.."}[5m]))
/
sum(rate(http_requests_total[5m]))

# Error rate by service (ratio; services with no 5xx series return no result)
sum by (service) (
  rate(http_requests_total{status_code=~"5.."}[5m])
)
/
sum by (service) (
  rate(http_requests_total[5m])
)
# CPU quota usage per namespace (ratio of used to hard limit; 1 = quota exhausted).
# Uses the quota's own "used" figure. Summing kube_pod_container_resource_requests
# would also include pods in a terminal phase (Succeeded/Failed), which no longer
# consume quota, and so overstates usage.
sum by (namespace) (
  kube_resourcequota{resource="requests.cpu", type="used"}
)
/
sum by (namespace) (
  kube_resourcequota{resource="requests.cpu", type="hard"}
)

# Memory quota usage per namespace (ratio of used to hard limit; 1 = quota exhausted)
sum by (namespace) (
  kube_resourcequota{resource="requests.memory", type="used"}
)
/
sum by (namespace) (
  kube_resourcequota{resource="requests.memory", type="hard"}
)
# Request rate through Ingress (requests per second)
sum by (namespace, ingress) (
  rate(nginx_ingress_controller_requests[5m])
)

# Error rate through Ingress: ratio of 4xx/5xx responses to all responses.
# Grouped by namespace as well as ingress, because Ingress names are only
# unique within a namespace.
sum by (namespace, ingress) (
  rate(nginx_ingress_controller_requests{status=~"[45].."}[5m])
)
/
sum by (namespace, ingress) (
  rate(nginx_ingress_controller_requests[5m])
)

# Ingress latency (p99, seconds).
# "le" must stay in the grouping: histogram_quantile needs the bucket boundaries.
histogram_quantile(
  0.99,
  sum by (le, namespace, ingress) (
    rate(nginx_ingress_controller_request_duration_seconds_bucket[5m])
  )
)

# Active connections, averaged over the matching series
# (use sum() instead of avg() for the total)
avg(nginx_ingress_controller_nginx_process_connections{state="active"})
# Prometheus alerting rules for basic Kubernetes health signals.
groups:
  - name: kubernetes
    rules:
      # Pod crash looping: more than 5 container restarts within the last hour,
      # sustained for 15 minutes.
      - alert: PodCrashLooping
        expr: |
          increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
          description: "Container {{ $labels.container }} has restarted {{ $value }} times in the last hour"

      # High memory usage: working set above 90% of the container's memory limit.
      # The two metrics come from different exporters and carry different label
      # sets, so both sides are aggregated to (namespace, pod, container) before
      # dividing; dividing the raw series matches nothing and never fires.
      - alert: HighMemoryUsage
        expr: |
          sum by (namespace, pod, container) (
            container_memory_working_set_bytes{container!=""}
          )
          /
          sum by (namespace, pod, container) (
            kube_pod_container_resource_limits{resource="memory", container!=""}
          )
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage in {{ $labels.namespace }}/{{ $labels.pod }}"

      # High error rate: more than 5% of a service's requests return 5xx.
      - alert: HighErrorRate
        expr: |
          sum by (service) (rate(http_requests_total{status_code=~"5.."}[5m]))
          /
          sum by (service) (rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate on {{ $labels.service }}: {{ $value | humanizePercentage }}"

      # Node not ready: the Ready condition's status="true" series has value 0.
      - alert: NodeNotReady
        expr: kube_node_status_condition{condition="Ready", status="true"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.node }} is not ready"