The error
127.0.0.1:6379> set name tom                ---> writing a key to the Redis cluster fails
-> Redirected to slot [5798] located at 192.168.3.2:6379
(error) CLUSTERDOWN The cluster is down
192.168.3.2:6379> cluster info
cluster_state:fail                          ---> the cluster state is reported as fail
cluster_slots_assigned:16384
cluster_slots_ok:10923
cluster_slots_pfail:0
cluster_slots_fail:5461
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:2
cluster_stats_messages_ping_sent:2203
cluster_stats_messages_pong_sent:392
cluster_stats_messages_meet_sent:4
cluster_stats_messages_fail_sent:4
cluster_stats_messages_sent:2603
cluster_stats_messages_ping_received:391
cluster_stats_messages_pong_received:310
cluster_stats_messages_meet_received:1
cluster_stats_messages_fail_received:1
cluster_stats_messages_received:703
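A hint is already in these numbers: cluster_slots_fail:5461 is exactly the size of one master's range (slots 0-5460 is 5461 slots), so a single failed master is more likely than a broken topology. CLUSTER NODES shows which node it is (a quick check, assuming any live node such as 192.168.3.2 is reachable from the host):

# The failed master carries the "master,fail" flag; its slot range matches the failing count.
redis-cli -h 192.168.3.2 -p 6379 cluster nodes | grep fail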
The fix
Checking the logs of all the Redis containers shows that the redis-5 container keeps retrying a connection to its master node node-1 at 192.168.3.1:6379:
[root@es-node22 ~]# docker logs -f redis-5
......
1:S 28 May 2022 13:07:53.233 # Cluster state changed: fail
1:S 28 May 2022 13:07:53.442 * Connecting to MASTER 192.168.3.1:6379
1:S 28 May 2022 13:07:53.442 * MASTER <-> REPLICA sync started
1:S 28 May 2022 13:07:53.442 # Error condition on socket for SYNC: Connection refused
1:S 28 May 2022 13:07:54.481 * Connecting to MASTER 192.168.3.1:6379
1:S 28 May 2022 13:07:54.481 * MASTER <-> REPLICA sync started
......
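"Connection refused" means the replica reaches a host but nothing is listening on that port. A minimal probe from inside the container confirms it (a sketch; the redis image ships redis-cli, so a PING is the simplest test):

# Try to PING the address redis-5 keeps dialing; a refused connection
# confirms no Redis instance answers at 192.168.3.1:6379.
docker exec -it redis-5 redis-cli -h 192.168.3.1 -p 6379 ping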
Check the node IP configured in node-1's Redis configuration file redis.conf:
[root@es-node22 ~]# cat /root/redis/node-1/conf/redis.conf
port 6379
bind 0.0.0.0
cluster-enabled yes
cluster-config-file nodes.conf              ---> the cluster node configuration file
cluster-node-timeout 5000
cluster-announce-ip 192.168.3.11            ---> node-1's announce IP is configured as 192.168.3.11
cluster-announce-port 6379
cluster-announce-bus-port 16379
appendonly yes
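To rule out the same drift on the remaining nodes, all six announce IPs can be dumped in one pass (a sketch, assuming the /root/redis/node-N/conf layout shown above):

# Print each node's announced IP next to its directory name.
for i in $(seq 1 6); do
    echo -n "node-${i}: "
    grep '^cluster-announce-ip' /root/redis/node-${i}/conf/redis.conf
done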
Check the current state of the cluster, displayed as an array:
192.168.3.2:6379> cluster slots             ---> the current cluster state, shown as an array
1) 1) (integer) 10923
   2) (integer) 16383
   3) 1) "192.168.3.3"
      2) (integer) 6379
      3) "ff0d1d636f94d9b092e6012408c1d0918e00e6ed"
   4) 1) "192.168.3.4"
      2) (integer) 6379
      3) "2113cf366ad27ebd73585f03d368e77f03b1a2e1"
2) 1) (integer) 0
   2) (integer) 5460
   3) 1) "192.168.3.1"                      ---> the cluster still records this node's IP as 192.168.3.1
      2) (integer) 6379
      3) "c856c94ba8d2c55a0d176831bc85aa34a96fde88"
   4) 1) "192.168.3.5"
      2) (integer) 6379
      3) "d92ff5984ab29370af0adeaca71e7938c0287ca5"
3) 1) (integer) 5461
   2) (integer) 10922
   3) 1) "192.168.3.2"
      2) (integer) 6379
      3) "8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3"
   4) 1) "192.168.3.6"
      2) (integer) 6379
      3) "2108a90495c147c675328f9b8b4fa49e2b856faf"
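Each top-level entry of the reply is [start slot, end slot, master, replicas...]. As a cross-check, CLUSTER KEYSLOT computes the hash slot of a key: the failing write on name hashed to slot 5798, which lies in the healthy 5461-10922 range served by 192.168.3.2 (hence the redirect at the top), while the fail state comes from the 0-5460 range still bound to the stale 192.168.3.1 entry:

192.168.3.2:6379> cluster keyslot name     ---> 5798 falls in 192.168.3.2's 5461-10922 range
(integer) 5798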
Check the cluster node configuration files (nodes.conf):
[root@es-node22 ~]# cat /root/redis/node-1/data/nodes.conf
c856c94ba8d2c55a0d176831bc85aa34a96fde88 192.168.3.1:6379@16379 myself,master - 0 1653743266000 1 connected 0-5460
d92ff5984ab29370af0adeaca71e7938c0287ca5 192.168.3.5:6379@16379 slave c856c94ba8d2c55a0d176831bc85aa34a96fde88 0 1653743274000 5 connected
2108a90495c147c675328f9b8b4fa49e2b856faf 192.168.3.6:6379@16379 slave 8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 0 1653743275531 6 connected
2113cf366ad27ebd73585f03d368e77f03b1a2e1 192.168.3.4:6379@16379 slave ff0d1d636f94d9b092e6012408c1d0918e00e6ed 0 1653743275531 4 connected
8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 192.168.3.2:6379@16379 master - 0 1653743275531 2 connected 5461-10922
ff0d1d636f94d9b092e6012408c1d0918e00e6ed 192.168.3.3:6379@16379 master - 0 1653743275000 3 connected 10923-16383
vars currentEpoch 6 lastVoteEpoch 0
[root@es-node22 ~]# cat /root/redis/node-2/data/nodes.conf
ff0d1d636f94d9b092e6012408c1d0918e00e6ed 192.168.3.3:6379@16379 master - 0 1653743273233 3 connected 10923-16383
2113cf366ad27ebd73585f03d368e77f03b1a2e1 192.168.3.4:6379@16379 slave ff0d1d636f94d9b092e6012408c1d0918e00e6ed 0 1653743271151 4 connected
c856c94ba8d2c55a0d176831bc85aa34a96fde88 192.168.3.1:6379@16379 master,fail - 1653743267074 1653743266961 1 connected 0-5460
d92ff5984ab29370af0adeaca71e7938c0287ca5 192.168.3.5:6379@16379 slave c856c94ba8d2c55a0d176831bc85aa34a96fde88 0 1653743272000 1 connected
8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 192.168.3.2:6379@16379 myself,master - 0 1653743271000 2 connected 5461-10922
2108a90495c147c675328f9b8b4fa49e2b856faf 192.168.3.6:6379@16379 slave 8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 0 1653743272194 6 connected
vars currentEpoch 6 lastVoteEpoch 0
As the output shows, every node's cluster configuration file nodes.conf still records this node as 192.168.3.1:6379, which is inconsistent with the cluster-announce-ip 192.168.3.11 in node-1's redis.conf.
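The mismatch is easy to confirm mechanically before editing anything (a sketch over the same node-N/data layout):

# List every nodes.conf that still carries the stale address;
# the ":" anchor keeps the pattern from matching 192.168.3.11.
grep -l '192.168.3.1:' /root/redis/node-*/data/nodes.conf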
Update this node's IP in every node's nodes.conf in one batch:
[root@es-node22 ~]# for i in $(seq 1 6); do \
> sed -i 's/192.168.3.1/192.168.3.11/' /root/redis/node-${i}/data/nodes.conf
> done
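Two caveats about that substitution: the unescaped dots match any character, and 192.168.3.1 is a prefix of 192.168.3.11, so running the loop a second time would mangle the already-fixed address into 192.168.3.111. A stricter, idempotent variant escapes the dots and anchors on the port separator (a sketch; since a running Redis rewrites nodes.conf itself, it is safest to stop the containers before editing):

# Escape the dots and anchor on ":" so only the exact stale address is rewritten,
# making the loop safe to re-run.
for i in $(seq 1 6); do
    sed -i 's/192\.168\.3\.1:/192.168.3.11:/' /root/redis/node-${i}/data/nodes.conf
done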
Check the modified nodes.conf files across the cluster:
[root@es-node22 ~]# cat /root/redis/node-1/data/nodes.conf
c856c94ba8d2c55a0d176831bc85aa34a96fde88 192.168.3.11:6379@16379 myself,master - 0 1653743266000 1 connected 0-5460
d92ff5984ab29370af0adeaca71e7938c0287ca5 192.168.3.5:6379@16379 slave c856c94ba8d2c55a0d176831bc85aa34a96fde88 0 1653743274000 5 connected
2108a90495c147c675328f9b8b4fa49e2b856faf 192.168.3.6:6379@16379 slave 8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 0 1653743275531 6 connected
2113cf366ad27ebd73585f03d368e77f03b1a2e1 192.168.3.4:6379@16379 slave ff0d1d636f94d9b092e6012408c1d0918e00e6ed 0 1653743275531 4 connected
8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 192.168.3.2:6379@16379 master - 0 1653743275531 2 connected 5461-10922
ff0d1d636f94d9b092e6012408c1d0918e00e6ed 192.168.3.3:6379@16379 master - 0 1653743275000 3 connected 10923-16383
vars currentEpoch 6 lastVoteEpoch 0
[root@es-node22 ~]# cat /root/redis/node-2/data/nodes.conf
ff0d1d636f94d9b092e6012408c1d0918e00e6ed 192.168.3.3:6379@16379 master - 0 1653743273233 3 connected 10923-16383
2113cf366ad27ebd73585f03d368e77f03b1a2e1 192.168.3.4:6379@16379 slave ff0d1d636f94d9b092e6012408c1d0918e00e6ed 0 1653743271151 4 connected
c856c94ba8d2c55a0d176831bc85aa34a96fde88 192.168.3.11:6379@16379 master,fail - 1653743267074 1653743266961 1 connected 0-5460
d92ff5984ab29370af0adeaca71e7938c0287ca5 192.168.3.5:6379@16379 slave c856c94ba8d2c55a0d176831bc85aa34a96fde88 0 1653743272000 1 connected
8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 192.168.3.2:6379@16379 myself,master - 0 1653743271000 2 connected 5461-10922
2108a90495c147c675328f9b8b4fa49e2b856faf 192.168.3.6:6379@16379 slave 8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 0 1653743272194 6 connected
vars currentEpoch 6 lastVoteEpoch 0
... ...
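Rather than eyeballing each file, an empty grep result proves the stale address is gone (a sketch):

# Should print nothing once every nodes.conf carries the new address.
grep -n '192\.168\.3\.1:' /root/redis/node-*/data/nodes.conf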
Restart all the Redis cluster containers in one batch:
[root@es-node22 ~]# docker restart $(docker ps | grep redis | awk '{print $1}')
dcd802a160c6
6e2f628457f6
f05d3dfb9c8b
220df78836e9
31e7b232f1d1
1de91b4d4e68
[root@es-node22 ~]# docker ps
CONTAINER ID   IMAGE                    COMMAND                  CREATED       STATUS       PORTS                                                                                      NAMES
6e2f628457f6   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6376->6379/tcp, :::6376->6379/tcp, 0.0.0.0:16376->16379/tcp, :::16376->16379/tcp   redis-6
f05d3dfb9c8b   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6375->6379/tcp, :::6375->6379/tcp, 0.0.0.0:16375->16379/tcp, :::16375->16379/tcp   redis-5
220df78836e9   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6374->6379/tcp, :::6374->6379/tcp, 0.0.0.0:16374->16379/tcp, :::16374->16379/tcp   redis-4
31e7b232f1d1   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6373->6379/tcp, :::6373->6379/tcp, 0.0.0.0:16373->16379/tcp, :::16373->16379/tcp   redis-3
1de91b4d4e68   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6372->6379/tcp, :::6372->6379/tcp, 0.0.0.0:16372->16379/tcp, :::16372->16379/tcp   redis-2
dcd802a160c6   redis:5.0.9-alpine3.11   "docker-entrypoint.s…"   3 hours ago   Up 2 hours   0.0.0.0:6371->6379/tcp, :::6371->6379/tcp, 0.0.0.0:16371->16379/tcp, :::16371->16379/tcp   redis-1
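Filtering by container name is slightly more precise than grepping the whole docker ps table, since grep would also match any container whose image or command merely contains "redis" (a sketch; assumes the containers are named redis-1 through redis-6 as above):

# -q prints only container IDs; the name filter matches on container names.
docker restart $(docker ps -q --filter 'name=redis-')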
Check the cluster state again:
[root@es-node22 ~]# docker exec -it redis-1 /bin/sh    ---> the Redis image ships no bash by default, only sh
/data # redis-cli -c
127.0.0.1:6379> cluster info
cluster_state:ok                            ---> the cluster state is back to ok
cluster_slots_assigned:16384
cluster_slots_ok:16384
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_known_nodes:6
cluster_size:3
cluster_current_epoch:6
cluster_my_epoch:1
cluster_stats_messages_ping_sent:236
cluster_stats_messages_pong_sent:233
cluster_stats_messages_sent:469
cluster_stats_messages_ping_received:233
cluster_stats_messages_pong_received:232
cluster_stats_messages_received:465
127.0.0.1:6379> cluster nodes
c856c94ba8d2c55a0d176831bc85aa34a96fde88 192.168.3.11:6379@16379 master - 0 1653752958838 1 connected 0-5460
8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 192.168.3.2:6379@16379 myself,master - 0 1653752957000 2 connected 5461-10922
2113cf366ad27ebd73585f03d368e77f03b1a2e1 192.168.3.4:6379@16379 slave ff0d1d636f94d9b092e6012408c1d0918e00e6ed 0 1653752957804 4 connected
2108a90495c147c675328f9b8b4fa49e2b856faf 192.168.3.6:6379@16379 slave 8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3 0 1653752957086 6 connected
ff0d1d636f94d9b092e6012408c1d0918e00e6ed 192.168.3.3:6379@16379 master - 0 1653752958000 3 connected 10923-16383
d92ff5984ab29370af0adeaca71e7938c0287ca5 192.168.3.5:6379@16379 slave c856c94ba8d2c55a0d176831bc85aa34a96fde88 0 1653752958529 1 connected
127.0.0.1:6379> cluster slots
1) 1) (integer) 5461
   2) (integer) 10922
   3) 1) "192.168.3.2"
      2) (integer) 6379
      3) "8b01b1bc6202e1dc7ff9f15013d8200b10ecb3f3"
   4) 1) "192.168.3.6"
      2) (integer) 6379
      3) "2108a90495c147c675328f9b8b4fa49e2b856faf"
2) 1) (integer) 0
   2) (integer) 5460
   3) 1) "192.168.3.11"                     ---> the node's IP in the cluster is now the corrected address
      2) (integer) 6379
      3) "c856c94ba8d2c55a0d176831bc85aa34a96fde88"
   4) 1) "192.168.3.5"
      2) (integer) 6379
      3) "d92ff5984ab29370af0adeaca71e7938c0287ca5"
3) 1) (integer) 10923
   2) (integer) 16383
   3) 1) "192.168.3.3"
      2) (integer) 6379
      3) "ff0d1d636f94d9b092e6012408c1d0918e00e6ed"
   4) 1) "192.168.3.4"
      2) (integer) 6379
      3) "2113cf366ad27ebd73585f03d368e77f03b1a2e1"
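The write that failed at the very beginning is the final verification; with -c the client follows the MOVED redirect instead of erroring out (a sketch based on the initial repro, expected to return OK now):

# Re-run the original failing write through cluster mode.
docker exec -it redis-1 redis-cli -c set name tom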
Another scenario
When the cluster reports cluster_state:fail, another possible cause is that the hash slots are not fully assigned. To protect cluster integrity, Redis by default marks the entire cluster unavailable as soon as any of the 16384 slots is not assigned to a node; this guarantees that every slot is served by an online node. In that case, reassigning the missing slots restores the cluster, as sketched below.
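For that scenario, the cluster tooling bundled with redis-cli in Redis 5+ can both detect and attempt to repair the coverage gap (a sketch; the exact prompts vary by version):

# Report slot coverage problems, e.g. "[ERR] Not all 16384 slots are covered by nodes."
redis-cli --cluster check 192.168.3.2:6379
# Offer to assign the uncovered slots so the cluster can return to the ok state.
redis-cli --cluster fix 192.168.3.2:6379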
For a fuller walkthrough of that case, see this post: resolving the unassigned-slots problem.