NFS与Keepalived实现高可用存储集群
环境准备与依赖安装
在所有节点上执行以下命令,安装必要的服务组件:
# Install the NFS server, keepalived and the file-sync tooling on every node.
# NOTE(review): inotify-tools is usually shipped in EPEL on RHEL/CentOS — confirm that repo is enabled.
yum install -y \
  nfs-utils \
  rpcbind \
  keepalived \
  rsync \
  inotify-tools
NFS共享配置
所有节点保持一致的配置,定义共享目录并启用服务:
# Create the export root first: exportfs cannot export a directory that does not exist.
mkdir -p /data/nfs
# Quoted delimiter keeps the heredoc body literal (no accidental shell expansion).
cat > /etc/exports << 'EOF'
/data/nfs 192.168.189.0/24(rw,sync,no_root_squash,no_all_squash,fsid=0,anonuid=0,anongid=0)
EOF
# Enable at boot and start immediately.
systemctl enable --now rpcbind nfs-server
# Re-export everything listed in /etc/exports and print the result.
exportfs -rv
Keepalived主备配置
通过单播(unicast)方式发送 VRRP 通告,避免组播报文被网络设备丢弃时各节点互相收不到通告而引发脑裂。三个节点均以 BACKUP 状态启动并启用 nopreempt,其中优先级最高的节点(192.168.189.155)在首次选举时成为主节点,其余节点为备用。
主节点(192.168.189.155)配置
vi /etc/keepalived/keepalived.conf
# keepalived.conf for node 192.168.189.155 (priority 150, the highest of the three nodes).
global_defs {
    router_id NFS_MASTER_155
    script_user root
    enable_script_security
}

# NFS health probe: run every 2 s; 2 consecutive failures mark it down, 2 successes mark it up.
vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    # "weight" is intentionally left at its default (0): a failing check then puts the
    # instance into FAULT and releases the VIP. A priority penalty (weight -50) cannot
    # trigger a failover here, because the peers use nopreempt and never take over
    # from a master that is still advertising, even at a lower priority.
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    # nopreempt requires the configured initial state to be BACKUP.
    state BACKUP
    interface ens3
    virtual_router_id 51
    priority 150
    advert_int 1
    # Do not take the VIP back from a working master when this node returns.
    # (preempt_delay was removed: it has no effect once nopreempt is set.)
    nopreempt
    garp_master_delay 1
    # Unicast VRRP: this node's address and the two peers.
    unicast_src_ip 192.168.189.155
    unicast_peer {
        192.168.189.163
        192.168.189.164
    }
    authentication {
        auth_type PASS
        # NOTE: keepalived only uses the first 8 characters of auth_pass.
        auth_pass NFS_HA_2026_Prod
    }
    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute
    }
    track_script {
        chk_nfs
    }
    # State-transition hooks.
    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}
备节点1(192.168.189.163)配置
vi /etc/keepalived/keepalived.conf
# keepalived.conf for node 192.168.189.163 (priority 120).
global_defs {
    router_id NFS_BACKUP_163
    script_user root
    enable_script_security
}

# NFS health probe: run every 2 s; 2 consecutive failures mark it down, 2 successes mark it up.
vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    # "weight" is intentionally left at its default (0): a failing check then puts the
    # instance into FAULT and releases the VIP. A priority penalty (weight -50) cannot
    # trigger a failover here, because the peers use nopreempt and never take over
    # from a master that is still advertising, even at a lower priority.
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    # nopreempt requires the configured initial state to be BACKUP.
    state BACKUP
    interface ens3
    virtual_router_id 51
    priority 120
    advert_int 1
    # Do not take the VIP from a working master when this node returns.
    # (preempt_delay was removed: it has no effect once nopreempt is set.)
    nopreempt
    garp_master_delay 1
    # Unicast VRRP: this node's address and the two peers.
    unicast_src_ip 192.168.189.163
    unicast_peer {
        192.168.189.155
        192.168.189.164
    }
    authentication {
        auth_type PASS
        # NOTE: keepalived only uses the first 8 characters of auth_pass.
        auth_pass NFS_HA_2026_Prod
    }
    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute
    }
    track_script {
        chk_nfs
    }
    # State-transition hooks.
    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}
备节点2(192.168.189.164)配置
vi /etc/keepalived/keepalived.conf
# keepalived.conf for node 192.168.189.164 (priority 100, the lowest of the three nodes).
global_defs {
    router_id NFS_BACKUP_164
    script_user root
    enable_script_security
}

# NFS health probe: run every 2 s; 2 consecutive failures mark it down, 2 successes mark it up.
vrrp_script chk_nfs {
    script "/usr/local/bin/chk_nfs.sh"
    interval 2
    # "weight" is intentionally left at its default (0): a failing check then puts the
    # instance into FAULT and releases the VIP. A priority penalty (weight -50) cannot
    # trigger a failover here, because the peers use nopreempt and never take over
    # from a master that is still advertising, even at a lower priority.
    fall 2
    rise 2
}

vrrp_instance VI_NFS {
    # nopreempt requires the configured initial state to be BACKUP.
    state BACKUP
    interface ens3
    virtual_router_id 51
    priority 100
    advert_int 1
    # Do not take the VIP from a working master when this node returns.
    # (preempt_delay was removed: it has no effect once nopreempt is set.)
    nopreempt
    garp_master_delay 1
    # Unicast VRRP: this node's address and the two peers.
    unicast_src_ip 192.168.189.164
    unicast_peer {
        192.168.189.155
        192.168.189.163
    }
    authentication {
        auth_type PASS
        # NOTE: keepalived only uses the first 8 characters of auth_pass.
        auth_pass NFS_HA_2026_Prod
    }
    virtual_ipaddress {
        192.168.189.156/24 dev ens3 noprefixroute
    }
    track_script {
        chk_nfs
    }
    # State-transition hooks.
    notify_master "/usr/local/bin/vip_master.sh"
    notify_backup "/usr/local/bin/vip_backup.sh"
    notify_fault "/usr/local/bin/vip_fault.sh"
}
核心健康检查与状态切换脚本
NFS健康检测脚本(/usr/local/bin/chk_nfs.sh)
#!/bin/bash
# NFS health probe referenced by the keepalived vrrp_script "chk_nfs".
# Exit status: 0 = healthy, 1 = unhealthy.

vip=192.168.189.156
probe_file=/data/nfs/.nfs_health_check

# 1. The nfs-server unit must be active.
if ! systemctl is-active --quiet nfs-server; then
  exit 1
fi

# 2. The export directory must be writable: create a marker file, then remove it.
if ! touch "$probe_file" 2>/dev/null; then
  exit 1
fi
rm -f "$probe_file"

# 3. Only when this node currently carries the VIP: the VIP must answer a single ping within 1 s.
if ip addr | grep -q "$vip"; then
  if ! ping -c 1 -W 1 "$vip" >/dev/null 2>&1; then
    exit 1
  fi
fi

exit 0
主节点接管脚本(/usr/local/bin/vip_master.sh)
#!/bin/bash
# keepalived notify_master hook: runs when this node becomes MASTER.
# Stops any stale sync loop, announces the VIP, restarts NFS and starts the
# inotify-based sync loop. Progress is appended to $LOG_FILE.
LOG_FILE="/var/log/nfs_ha.log"
VIP="192.168.189.156"
DEV="ens3"
NFS_DIR="/data/nfs"

# Append one timestamped [MASTER] line to the HA log.
#   $1 - message text
log() {
  echo "$(date +'%Y-%m-%d %H:%M:%S') [MASTER] - $1" >> "$LOG_FILE"
}

log "============ 主节点切换开始 ============"

# Stop a sync loop left over from an earlier MASTER period, including its watcher.
pkill -f "nfs_inotify_sync.sh" >/dev/null 2>&1
pkill -x inotifywait >/dev/null 2>&1
sleep 2

# Pulling from the VIP is only meaningful while another node still answers on it.
# Once the VIP is configured locally, "root@$VIP" is this very host, and an
# rsync --delete against ourselves cannot bring in newer data.
if ip addr show | grep -qF "inet ${VIP}/"; then
  log "VIP已在本机,跳过从VIP拉取数据"
else
  log "同步远程最新数据至本地"
  # --timeout keeps an unreachable peer from blocking the takeover indefinitely.
  if ! rsync -avz --delete --exclude ".nfs_*" --timeout=30 \
    "root@${VIP}:${NFS_DIR}/" "${NFS_DIR}/" >> "$LOG_FILE" 2>&1; then
    log "数据同步失败,继续使用本地数据"
  fi
fi

# -U sends unsolicited (gratuitous) ARP so neighbours update their caches for the VIP.
arping -U -c 3 -I "$DEV" "$VIP" >/dev/null 2>&1
log "ARP缓存刷新完成"

systemctl restart nfs-server
# Explicit if/else: with "a && b || c" the failure branch would also run if the success log failed.
if systemctl is-active --quiet nfs-server; then
  log "NFS服务正常"
else
  log "NFS启动失败!"
fi

# Start the real-time sync loop detached from this hook.
nohup /usr/local/bin/nfs_inotify_sync.sh >> /var/log/nfs_inotify.log 2>&1 &
log "实时同步进程已启动"
log "主节点切换完成"
备节点同步脚本(/usr/local/bin/vip_backup.sh)
#!/bin/bash
# keepalived notify_backup hook: runs when this node enters BACKUP.
# Stops the local sync loop and pulls the current data set from the VIP holder.
LOG_FILE="/var/log/nfs_ha.log"
VIP="192.168.189.156"
NFS_DIR="/data/nfs"

# Append one timestamped [BACKUP] line to the HA log.
#   $1 - message text
log() {
  echo "$(date +'%Y-%m-%d %H:%M:%S') [BACKUP] - $1" >> "$LOG_FILE"
}

log "============ 备节点切换开始 ============"

# Ask the sync loop and its watcher to stop (SIGTERM), give them a grace period,
# then force-kill whatever is still running.
pkill -f "nfs_inotify_sync.sh" >/dev/null 2>&1
pkill -f "inotifywait" >/dev/null 2>&1
sleep 2
pkill -9 -f "nfs_inotify_sync.sh" >/dev/null 2>&1
pkill -9 -f "inotifywait" >/dev/null 2>&1

log "从主节点拉取最新数据"
# ".nfs_*" already covers the health-check marker (.nfs_health_check).
if rsync -avz --delete \
  --exclude ".nfs_*" \
  --timeout=30 \
  "root@${VIP}:${NFS_DIR}/" "${NFS_DIR}/" >> "$LOG_FILE" 2>&1; then
  log "✅ 数据同步成功"
else
  log "❌ 数据同步失败,请检查网络或认证配置"
fi
log "备节点切换完成"
故障通知脚本(/usr/local/bin/vip_fault.sh)
#!/bin/bash
# keepalived notify_fault hook: records the transition to FAULT in the HA log.
VIP="192.168.189.156"
LOG_FILE="/var/log/nfs_ha.log"

# Append one timestamped [FAULT] line to the HA log.
#   $1 - message text
log() {
  local stamp
  stamp=$(date +'%Y-%m-%d %H:%M:%S')
  echo "${stamp} [FAULT] - $1" >> "$LOG_FILE"
}

log "VIP $VIP 已漂移,当前节点进入故障状态"
实时文件同步脚本(/usr/local/bin/nfs_inotify_sync.sh)
#!/bin/bash
# Settings for the inotify-driven rsync loop.

# Directory that is watched and replicated.
NFS_DIR="/data/nfs"
# Virtual IP; the loop only runs on the node that currently holds it.
VIP="192.168.189.156"
# All cluster members (the local node is skipped at sync time).
NODE_LIST=(
  "192.168.189.155"
  "192.168.189.163"
  "192.168.189.164"
)
LOG_FILE="/var/log/nfs_inotify.log"
# rsync --timeout value, seconds.
SYNC_TIMEOUT=10
# Attempts per target node and the pause between them, seconds.
RETRY_COUNT=3
RETRY_INTERVAL=2
# Append one timestamped line to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - severity tag (e.g. INFO, WARN, ERROR)
#            $2 - message text
log() {
  local level=$1
  local msg=$2
  # $LOG_FILE is quoted so a path containing spaces is not an "ambiguous redirect".
  printf '%s [%s] - %s\n' "$(date +'%Y-%m-%d %H:%M:%S')" "$level" "$msg" >> "$LOG_FILE"
}
# Terminate the script (exit status 0) unless this node currently holds the VIP.
# Globals: VIP (read)
check_vip() {
  # Fixed-string match on "inet <VIP>/": the dots are not treated as regex
  # wildcards and a VIP that is a prefix of another address (…15 vs …156) cannot match.
  if ! ip addr | grep -qF "inet ${VIP}/"; then
    log "ERROR" "未持有VIP,退出同步"
    exit 0
  fi
}
# Push the whole $NFS_DIR tree to one node with rsync, retrying on failure.
# Globals:   NFS_DIR, LOG_FILE, RETRY_COUNT, RETRY_INTERVAL, SYNC_TIMEOUT (read)
# Arguments: $1 - target node address
#            $2 - path that triggered the sync (accepted for callers; the full tree is always synced)
# Returns:   0 once an rsync run succeeds, 1 after RETRY_COUNT failed attempts
single_sync() {
  local target_node=$1
  local attempt=0
  while (( attempt < RETRY_COUNT )); do
    # -a already preserves modification times, so no separate --times is needed.
    if rsync -avz --delete \
      --exclude ".nfs_*" \
      --timeout="$SYNC_TIMEOUT" \
      "${NFS_DIR}/" "root@${target_node}:${NFS_DIR}/" >> "$LOG_FILE" 2>&1; then
      log "INFO" "同步到 $target_node 成功"
      return 0
    fi
    attempt=$((attempt + 1))
    log "WARN" "重试第${attempt}次"
    # No pause after the final attempt: nothing is left to wait for.
    if (( attempt < RETRY_COUNT )); then
      sleep "$RETRY_INTERVAL"
    fi
  done
  log "ERROR" "同步失败,已达最大重试次数"
  return 1
}
# Watch $NFS_DIR with inotifywait and push every change to the other nodes.
# Globals: NFS_DIR, VIP, NODE_LIST (read)
# The watch loop ends as soon as a change is seen while this node no longer holds the VIP.
sync_data() {
  local current_ip
  local node
  # Declared separately so "local" does not mask the pipeline's exit status.
  current_ip=$(hostname -I | awk '{print $1}')
  check_vip
  log "INFO" "本机($current_ip)持有主用地址,开始监听变化"
  # IFS= and -r keep leading blanks and backslashes in file names intact.
  inotifywait -mrq --format '%w%f' -e create,delete,modify,move "$NFS_DIR" | while IFS= read -r changed; do
    # Skip .nfs_* entries (health-check marker, NFS silly-rename files).
    if [[ $changed == *".nfs_"* ]]; then continue; fi
    # Verify VIP ownership BEFORE pushing: a node that has just lost the VIP
    # must not overwrite the new holder's data with rsync --delete.
    # "exit" only leaves the pipeline subshell that runs this loop.
    if ! ip addr | grep -qF "inet ${VIP}/"; then
      log "ERROR" "VIP已漂移,停止同步"
      exit 0
    fi
    log "INFO" "检测到变更:$changed"
    for node in "${NODE_LIST[@]}"; do
      if [[ "$node" != "$current_ip" ]]; then
        single_sync "$node" "$changed"
      fi
    done
  done
}
# Validate prerequisites, then start the sync loop.
# Globals: NFS_DIR (read)
# Exits with status 1 when $NFS_DIR is not a directory or inotifywait is not available.
main() {
  # Quoted test: an unquoted path with spaces would make "[" fail with a syntax
  # error and the missing-directory case would slip through.
  if [[ ! -d "$NFS_DIR" ]]; then
    log "ERROR" "NFS目录不存在"
    exit 1
  fi
  # command -v is the shell builtin way to look a command up; no external "which" needed.
  if ! command -v inotifywait >/dev/null 2>&1; then
    log "ERROR" "缺少inotify-tools"
    exit 1
  fi
  check_vip
  sync_data
}
# Entry point: run the prerequisite checks in main and start the sync loop.
main
权限设置与服务启动
chmod +x /usr/local/bin/chk_nfs.sh /usr/local/bin/vip_*.sh /usr/local/bin/nfs_inotify_sync.sh
# Clear stale state, then start keepalived.
# Stop through systemd first so keepalived can shut down cleanly; SIGKILL is only a
# fallback for processes that are still around.
systemctl stop keepalived 2>/dev/null
pkill -9 keepalived
# The VIP is configured as /24 in keepalived.conf; "ip addr del" only removes an
# address whose prefix length matches, so /32 would silently leave it in place.
ip addr del 192.168.189.156/24 dev ens3 2>/dev/null
systemctl enable --now keepalived
# Verify that the VIP is present
ip addr show ens3 | grep 192.168.189.156
# Test synchronisation
touch /data/nfs/test.txt
# On the other nodes, check whether the file exists
# Fault simulation
systemctl stop keepalived
# Watch whether another node acquires the VIP
Kubernetes中使用NFS存储
创建命名空间与存储类:
# Namespace that holds the provisioner's ServiceAccount, Role and Deployment.
kubectl create namespace shared
# Default StorageClass served by the NFS provisioner.
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: nfs-storage
  # StorageClass is a cluster-scoped resource, so no namespace is set.
  annotations:
    # Legacy (beta) and current default-class annotations.
    storageclass.beta.kubernetes.io/is-default-class: 'true'
    storageclass.kubernetes.io/is-default-class: 'true'
  labels:
    environment: test
# Must match the PROVISIONER_NAME env value of the provisioner Deployment.
provisioner: fuseim.pri/ifs
reclaimPolicy: Retain
volumeBindingMode: Immediate
mountOptions:
  - hard
  - nocto
  - noacl
配置RBAC权限:
# Identity the provisioner pods run as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-client-provisioner
  namespace: shared
---
# Cluster-wide permissions: manage PVs, read/update PVCs, read StorageClasses, emit events.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nfs-client-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
---
# Grants the ClusterRole above to the provisioner's ServiceAccount.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: run-nfs-client-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: shared
roleRef:
  kind: ClusterRole
  name: nfs-client-provisioner-runner
  apiGroup: rbac.authorization.k8s.io
---
# Namespaced permissions on endpoints (per the role name, used for leader locking).
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: shared
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
---
# Grants the leader-locking Role to the provisioner's ServiceAccount.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: shared
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: shared
roleRef:
  kind: Role
  name: leader-locking-nfs-client-provisioner
  apiGroup: rbac.authorization.k8s.io
部署Provisioner控制器:
# Provisioner controller: mounts the NFS export (via the keepalived VIP) and
# serves the "fuseim.pri/ifs" provisioner name.
kind: Deployment
apiVersion: apps/v1
metadata:
  name: nfs-client-provisioner
  namespace: shared
spec:
  replicas: 3
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: nfs-client-provisioner
  template:
    metadata:
      labels:
        app: nfs-client-provisioner
    spec:
      serviceAccountName: nfs-client-provisioner
      containers:
        - name: nfs-client-provisioner
          image: docker.1ms.run/eipwork/nfs-subdir-external-provisioner:v4.0.2
          imagePullPolicy: IfNotPresent
          volumeMounts:
            - name: nfs-client-root
              mountPath: /persistentvolumes
          env:
            # Must match the "provisioner" field of the StorageClass.
            - name: PROVISIONER_NAME
              value: fuseim.pri/ifs
            # NFS server address and export path; same values as the volume below.
            - name: NFS_SERVER
              value: 192.168.189.156
            - name: NFS_PATH
              value: /data/nfs
      volumes:
        - name: nfs-client-root
          nfs:
            server: 192.168.189.156
            path: /data/nfs
验证部署结果:
# List pods across all namespaces and keep the lines that mention "shared".
kubectl get pods --all-namespaces | grep shared