Kubernetes API Server故障排查完整指南 - 诊断工具与恢复策略

Kubernetes API Server故障排查完整指南

常见故障场景诊断

1. API Server启动失败

场景描述

API Server进程无法启动或启动后立即退出,集群完全不可用。

常见原因和诊断方法

证书配置错误

1
2
3
4
5
6
7
8
9
# 检查证书文件权限和存在性
ls -la /etc/kubernetes/pki/
openssl x509 -in /etc/kubernetes/pki/apiserver.crt -text -noout

# 验证证书有效期
openssl x509 -in /etc/kubernetes/pki/apiserver.crt -noout -dates

# 检查证书链完整性
openssl verify -CAfile /etc/kubernetes/pki/ca.crt /etc/kubernetes/pki/apiserver.crt

etcd连接问题排查

1
2
3
4
5
6
7
8
9
10
11
12
13
# 检查etcd集群状态
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
endpoint health

# 测试API Server到etcd的连接
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/apiserver-etcd-client.crt \
--key=/etc/kubernetes/pki/apiserver-etcd-client.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
get "" --prefix=true --keys-only | head -10

端口占用问题诊断

1
2
3
4
5
6
7
# 检查端口占用
netstat -tulpn | grep :6443
lsof -i :6443

# 检查防火墙规则
iptables -L -n | grep 6443
systemctl status firewalld

2. 高延迟问题诊断

延迟监控工具

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/bin/bash
# api-latency-monitor.sh - API延迟实时监控

# 监控API请求延迟
monitor_latency() {
while true; do
start_time=$(date +%s%N)
kubectl get pods --request-timeout=30s > /dev/null 2>&1
end_time=$(date +%s%N)

latency=$(( (end_time - start_time) / 1000000 )) # 转换为毫秒
timestamp=$(date '+%Y-%m-%d %H:%M:%S')

echo "[$timestamp] API latency: ${latency}ms"

if [ $latency -gt 1000 ]; then
echo "⚠️ WARNING: High latency detected!"
collect_diagnostic_info
fi

sleep 5
done
}

collect_diagnostic_info() {
echo "=== Diagnostic Info at $(date) ==="

# API Server CPU和内存使用
kubectl top pod -n kube-system | grep apiserver

# 检查API Server日志中的慢查询
kubectl logs -n kube-system kube-apiserver-master --tail=50 | grep -i "slow\|timeout"

# 检查etcd延迟
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--command-timeout=5s \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
check perf

# 检查网络连接
ss -tuln | grep :6443
ss -tuln | grep :2379
}

monitor_latency

性能分析工具

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# curl延迟测试工具
measure_api_latency() {
local endpoint="$1"
local token="$2"

curl -w "@curl-format.txt" -o /dev/null -s \
-H "Authorization: Bearer $token" \
-H "Accept: application/json" \
"https://kubernetes-api:6443$endpoint"
}

# curl-format.txt
cat > curl-format.txt << 'EOF'
time_namelookup: %{time_namelookup}s
time_connect: %{time_connect}s
time_appconnect: %{time_appconnect}s
time_pretransfer: %{time_pretransfer}s
time_redirect: %{time_redirect}s
time_starttransfer: %{time_starttransfer}s
----------
time_total: %{time_total}s
EOF

# 测试不同API端点
TOKEN=$(kubectl get secret -n kube-system $(kubectl get sa -n kube-system default -o jsonpath='{.secrets[0].name}') -o jsonpath='{.data.token}' | base64 -d)

measure_api_latency "/api/v1/pods" "$TOKEN"
measure_api_latency "/api/v1/nodes" "$TOKEN"
measure_api_latency "/apis/apps/v1/deployments" "$TOKEN"

3. 内存泄漏问题诊断

内存监控脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/bin/bash
# memory-monitor.sh - API Server内存泄漏监控

monitor_memory() {
while true; do
# 获取API Server容器内存使用
memory_usage=$(kubectl top pod -n kube-system | grep apiserver | awk '{print $3}')
timestamp=$(date '+%Y-%m-%d %H:%M:%S')

echo "[$timestamp] API Server memory usage: $memory_usage"

# 提取数值部分(去除单位)
memory_val=$(echo $memory_usage | sed 's/[^0-9]//g')

if [ "$memory_val" -gt 4000 ]; then # 超过4GB
echo "⚠️ WARNING: High memory usage detected!"
collect_memory_info
fi

sleep 30
done
}

collect_memory_info() {
echo "=== Memory Diagnostic Info ==="

# Go runtime内存统计
kubectl exec -n kube-system kube-apiserver-master -- \
curl -s http://localhost:8080/debug/pprof/heap > heap-profile-$(date +%s).pprof

# 获取详细的内存分配信息
kubectl exec -n kube-system kube-apiserver-master -- \
curl -s http://localhost:8080/debug/pprof/allocs > allocs-profile-$(date +%s).pprof

# 检查goroutine数量
kubectl exec -n kube-system kube-apiserver-master -- \
curl -s http://localhost:8080/debug/pprof/goroutine?debug=1 | head -20

# 检查Watch连接数
kubectl exec -n kube-system kube-apiserver-master -- \
netstat -an | grep :6443 | grep ESTABLISHED | wc -l
}

monitor_memory

内存泄漏检测程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
// memory-leak-detector.go
package main

import (
"context"
"fmt"
"runtime"
"time"
)

type MemoryLeakDetector struct {
baselineMemory uint64
threshold uint64
checkInterval time.Duration
}

func NewMemoryLeakDetector(threshold uint64, interval time.Duration) *MemoryLeakDetector {
var m runtime.MemStats
runtime.ReadMemStats(&m)

return &MemoryLeakDetector{
baselineMemory: m.Alloc,
threshold: threshold,
checkInterval: interval,
}
}

func (mld *MemoryLeakDetector) Start(ctx context.Context) {
ticker := time.NewTicker(mld.checkInterval)
defer ticker.Stop()

for {
select {
case <-ticker.C:
mld.checkMemoryLeak()
case <-ctx.Done():
return
}
}
}

func (mld *MemoryLeakDetector) checkMemoryLeak() {
var m runtime.MemStats
runtime.ReadMemStats(&m)

currentMemory := m.Alloc
memoryGrowth := currentMemory - mld.baselineMemory

fmt.Printf("Memory Stats:\n")
fmt.Printf(" Current Alloc: %d KB\n", currentMemory/1024)
fmt.Printf(" Baseline: %d KB\n", mld.baselineMemory/1024)
fmt.Printf(" Growth: %d KB\n", memoryGrowth/1024)
fmt.Printf(" Sys: %d KB\n", m.Sys/1024)
fmt.Printf(" NumGC: %d\n", m.NumGC)
fmt.Printf(" GCCPUFraction: %.4f\n", m.GCCPUFraction)

if memoryGrowth > mld.threshold {
fmt.Printf("🚨 WARNING: Potential memory leak detected! Growth: %d KB\n",
memoryGrowth/1024)

// 强制GC并重新检查
runtime.GC()
runtime.GC()

runtime.ReadMemStats(&m)
afterGCMemory := m.Alloc
fmt.Printf("After GC: %d KB\n", afterGCMemory/1024)

// 更新基线
mld.baselineMemory = afterGCMemory
}
}

func main() {
detector := NewMemoryLeakDetector(100*1024*1024, 30*time.Second) // 100MB阈值,30秒检查

ctx := context.Background()
detector.Start(ctx)
}

4. etcd连接问题排查

全面连接检查脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
# etcd-connection-check.sh - etcd连接全面诊断

check_etcd_connection() {
echo "=== etcd Connection Diagnostic ==="

# 1. 检查etcd集群健康状态
echo "1. Checking etcd cluster health..."
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
endpoint health

# 2. 检查etcd集群成员
echo "2. Checking etcd cluster members..."
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
member list

# 3. 检查etcd性能
echo "3. Checking etcd performance..."
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
check perf

# 4. 检查API Server到etcd的连接
echo "4. Testing API Server to etcd connectivity..."
test_etcd_connectivity

# 5. 检查etcd数据大小
echo "5. Checking etcd database size..."
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/etcd/server.crt \
--key=/etc/kubernetes/pki/etcd/server.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
endpoint status --write-out=table
}

test_etcd_connectivity() {
# 使用API Server的etcd客户端证书测试连接
ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/apiserver-etcd-client.crt \
--key=/etc/kubernetes/pki/apiserver-etcd-client.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--command-timeout=5s \
put test-key test-value

ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/apiserver-etcd-client.crt \
--key=/etc/kubernetes/pki/apiserver-etcd-client.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--command-timeout=5s \
get test-key

ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 \
--cert=/etc/kubernetes/pki/apiserver-etcd-client.crt \
--key=/etc/kubernetes/pki/apiserver-etcd-client.key \
--cacert=/etc/kubernetes/pki/etcd/ca.crt \
--command-timeout=5s \
del test-key
}

check_etcd_connection

5. 认证授权问题诊断

RBAC问题排查脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
# rbac-troubleshoot.sh - RBAC权限问题诊断

troubleshoot_rbac() {
local user="$1"
local verb="$2"
local resource="$3"
local namespace="${4:-}"

echo "=== RBAC Troubleshooting for User: $user ==="

# 1. 检查用户权限
echo "1. Checking user permissions..."
if [ -n "$namespace" ]; then
kubectl auth can-i "$verb" "$resource" --as="$user" --namespace="$namespace"
else
kubectl auth can-i "$verb" "$resource" --as="$user"
fi

# 2. 列出用户的所有权限
echo "2. Listing all permissions for user..."
kubectl auth can-i --list --as="$user"

# 3. 查找相关的RoleBinding和ClusterRoleBinding
echo "3. Finding relevant RoleBindings..."
kubectl get rolebinding,clusterrolebinding -o json | \
jq -r --arg user "$user" '.items[] | select(.subjects[]?.name == $user) | .metadata.name'

# 4. 检查ServiceAccount Token
if [[ "$user" =~ ^system:serviceaccount: ]]; then
echo "4. Checking ServiceAccount token..."
sa_namespace=$(echo "$user" | cut -d: -f3)
sa_name=$(echo "$user" | cut -d: -f4)

kubectl get sa "$sa_name" -n "$sa_namespace" -o yaml
kubectl get secret $(kubectl get sa "$sa_name" -n "$sa_namespace" -o jsonpath='{.secrets[0].name}') -n "$sa_namespace" -o yaml
fi
}

# 使用示例
troubleshoot_rbac "system:serviceaccount:default:my-sa" "get" "pods" "default"
troubleshoot_rbac "jane" "create" "deployments" "production"

RBAC权限检查工具

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
// rbac-checker.go
package main

import (
"context"
"fmt"

authorizationv1 "k8s.io/api/authorization/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
)

func checkUserPermissions(clientset *kubernetes.Clientset, user string, verb string, resource string, namespace string) {
// 创建SubjectAccessReview
sar := &authorizationv1.SubjectAccessReview{
Spec: authorizationv1.SubjectAccessReviewSpec{
User: user,
ResourceAttributes: &authorizationv1.ResourceAttributes{
Verb: verb,
Group: "",
Version: "v1",
Resource: resource,
Namespace: namespace,
},
},
}

// 发送权限检查请求
result, err := clientset.AuthorizationV1().SubjectAccessReviews().Create(
context.TODO(), sar, metav1.CreateOptions{})
if err != nil {
fmt.Printf("Error checking permissions: %v\n", err)
return
}

// 输出结果
if result.Status.Allowed {
fmt.Printf("✅ User %s CAN %s %s in namespace %s\n", user, verb, resource, namespace)
} else {
fmt.Printf("❌ User %s CANNOT %s %s in namespace %s\n", user, verb, resource, namespace)
if result.Status.Reason != "" {
fmt.Printf(" Reason: %s\n", result.Status.Reason)
}
}
}

日志分析技巧

API Server日志分析

结构化日志解析脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash
# parse-apiserver-logs.sh - API Server日志智能分析

parse_apiserver_logs() {
local log_file="${1:-/var/log/pods/kube-system_kube-apiserver*/*.log}"

echo "=== API Server Log Analysis ==="

# 1. 错误统计
echo "1. Error Summary:"
grep -E "(ERROR|error)" "$log_file" | \
sed 's/.*\(ERROR\|error\).*/\1/' | \
sort | uniq -c | sort -nr

# 2. 慢查询分析
echo "2. Slow Queries (>1s):"
grep "slow" "$log_file" | \
sed -n 's/.*duration:\([0-9.]*s\).*/\1/p' | \
sort -n | tail -10

# 3. 认证失败
echo "3. Authentication Failures:"
grep -i "authentication\|unauthorized" "$log_file" | \
head -10

# 4. 授权拒绝
echo "4. Authorization Denials:"
grep -i "forbidden\|access denied" "$log_file" | \
head -10

# 5. etcd错误
echo "5. etcd Errors:"
grep -i "etcd" "$log_file" | grep -i "error" | \
head -10

# 6. Watch连接统计
echo "6. Watch Connection Stats:"
grep "watch" "$log_file" | \
grep -o "established\|closed" | \
sort | uniq -c
}

# 实时日志监控
monitor_apiserver_logs() {
kubectl logs -n kube-system -f kube-apiserver-master | \
while read line; do
timestamp=$(date '+%Y-%m-%d %H:%M:%S')

# 检查错误模式
if echo "$line" | grep -qE "(ERROR|FATAL|panic)"; then
echo "[$timestamp] 🚨 CRITICAL: $line"
elif echo "$line" | grep -qE "(WARN|warning)"; then
echo "[$timestamp] ⚠️ WARNING: $line"
elif echo "$line" | grep -qE "slow.*[0-9]+s"; then
echo "[$timestamp] 🐌 SLOW QUERY: $line"
fi
done
}

parse_apiserver_logs

审计日志分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# 审计日志分析脚本
analyze_audit_logs() {
local audit_log="/var/log/audit.log"

echo "=== Audit Log Analysis ==="

# 1. 用户活动统计
echo "1. User Activity:"
jq -r '.user.username // "unknown"' "$audit_log" | \
sort | uniq -c | sort -nr | head -10

# 2. 资源访问统计
echo "2. Resource Access:"
jq -r '.objectRef.resource // "unknown"' "$audit_log" | \
sort | uniq -c | sort -nr | head -10

# 3. 失败的操作
echo "3. Failed Operations:"
jq -r 'select(.responseStatus.code >= 400) |
"\(.user.username) \(.verb) \(.objectRef.resource) \(.responseStatus.code)"' \
"$audit_log" | head -20

# 4. 敏感资源访问
echo "4. Sensitive Resource Access:"
jq -r 'select(.objectRef.resource == "secrets" or .objectRef.resource == "configmaps") |
"\(.user.username) \(.verb) \(.objectRef.name)"' \
"$audit_log" | head -20
}

恢复策略和自动修复

自动故障转移机制

API Server健康检查配置

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# API Server健康检查配置
apiVersion: v1
kind: Pod
metadata:
name: kube-apiserver
namespace: kube-system
spec:
containers:
- name: kube-apiserver
image: k8s.gcr.io/kube-apiserver:v1.28.0
livenessProbe:
httpGet:
path: /livez
port: 6443
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 15
periodSeconds: 10
successThreshold: 1
failureThreshold: 8
readinessProbe:
httpGet:
path: /readyz
port: 6443
scheme: HTTPS
initialDelaySeconds: 0
timeoutSeconds: 15
periodSeconds: 1
successThreshold: 1
failureThreshold: 3

自动故障转移脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/bin/bash
# api-server-failover.sh - API Server自动故障转移

# 配置参数
PRIMARY_APISERVER="10.0.0.1:6443"
BACKUP_APISERVER="10.0.0.2:6443"
HEALTH_CHECK_INTERVAL=30
MAX_FAILURES=3

failure_count=0

check_apiserver_health() {
local endpoint="$1"

# 检查API Server健康状态
if curl -k --connect-timeout 5 "https://$endpoint/livez" &>/dev/null; then
return 0
else
return 1
fi
}

failover_to_backup() {
echo "🔄 Failing over to backup API Server: $BACKUP_APISERVER"

# 更新负载均衡器配置
update_loadbalancer_config

# 通知管理员
send_alert "API Server failover executed"

# 重置计数器
failure_count=0
}

update_loadbalancer_config() {
# 更新HAProxy配置
cat > /etc/haproxy/haproxy.cfg << EOF
global
daemon

defaults
mode tcp
timeout connect 5000ms
timeout client 50000ms
timeout server 50000ms

frontend apiserver-frontend
bind *:6443
default_backend apiserver-backend

backend apiserver-backend
balance roundrobin
server backup-apiserver $BACKUP_APISERVER check
EOF

# 重启HAProxy
systemctl reload haproxy
}

send_alert() {
local message="$1"
# 发送告警通知
logger "API Server Monitor: $message"
# 可以集成钉钉、Slack等通知
}

# 主监控循环
while true; do
if check_apiserver_health "$PRIMARY_APISERVER"; then
echo "✅ Primary API Server is healthy"
failure_count=0
else
failure_count=$((failure_count + 1))
echo "❌ Primary API Server check failed (attempt $failure_count/$MAX_FAILURES)"

if [ $failure_count -ge $MAX_FAILURES ]; then
failover_to_backup
fi
fi

sleep $HEALTH_CHECK_INTERVAL
done

etcd备份恢复策略

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/bin/bash
# etcd-backup-restore.sh - etcd数据备份恢复

ETCD_ENDPOINTS="https://127.0.0.1:2379"
BACKUP_DIR="/var/lib/etcd-backup"
CERT_DIR="/etc/kubernetes/pki/etcd"

# 创建备份
create_etcd_backup() {
local backup_name="etcd-backup-$(date +%Y%m%d-%H%M%S)"
local backup_path="$BACKUP_DIR/$backup_name"

echo "📦 Creating etcd backup: $backup_path"

ETCDCTL_API=3 etcdctl snapshot save "$backup_path" \
--endpoints="$ETCD_ENDPOINTS" \
--cert="$CERT_DIR/server.crt" \
--key="$CERT_DIR/server.key" \
--cacert="$CERT_DIR/ca.crt"

if [ $? -eq 0 ]; then
echo "✅ Backup created successfully: $backup_path"

# 验证备份
ETCDCTL_API=3 etcdctl snapshot status "$backup_path" \
--write-out=table
else
echo "❌ Backup failed!"
return 1
fi
}

# 恢复备份
restore_etcd_backup() {
local backup_file="$1"
local data_dir="$2"

if [ ! -f "$backup_file" ]; then
echo "❌ Backup file not found: $backup_file"
return 1
fi

echo "🔄 Restoring etcd from backup: $backup_file"

# 停止etcd服务
systemctl stop etcd

# 备份当前数据目录
if [ -d "$data_dir" ]; then
mv "$data_dir" "${data_dir}.backup-$(date +%Y%m%d-%H%M%S)"
fi

# 恢复数据
ETCDCTL_API=3 etcdctl snapshot restore "$backup_file" \
--data-dir="$data_dir" \
--name=etcd-server \
--initial-cluster=etcd-server=https://127.0.0.1:2380 \
--initial-advertise-peer-urls=https://127.0.0.1:2380

# 重启etcd服务
systemctl start etcd

# 验证恢复
sleep 5
ETCDCTL_API=3 etcdctl endpoint health \
--endpoints="$ETCD_ENDPOINTS" \
--cert="$CERT_DIR/server.crt" \
--key="$CERT_DIR/server.key" \
--cacert="$CERT_DIR/ca.crt"
}

# 定期备份任务
schedule_backup() {
# 添加到crontab
(crontab -l 2>/dev/null; echo "0 2 * * * $PWD/etcd-backup-restore.sh backup") | crontab -
}

case "${1:-backup}" in
backup)
create_etcd_backup
;;
restore)
if [ -z "$2" ]; then
echo "Usage: $0 restore <backup-file> [data-dir]"
exit 1
fi
restore_etcd_backup "$2" "${3:-/var/lib/etcd}"
;;
schedule)
schedule_backup
;;
*)
echo "Usage: $0 {backup|restore|schedule}"
exit 1
;;
esac

这是Kubernetes API Server故障排查的完整实战指南,涵盖了从问题诊断到自动恢复的全流程。在生产环境中,建议将这些脚本集成到监控系统中实现自动化运维。

系列文章导航: