package cluster

import (
	"crypto/tls"
	"encoding/binary"
	"fmt"
	"os"
	"path/filepath"
	"sync"
	"time"

	"github.com/hashicorp/raft"
	raftboltdb "github.com/hashicorp/raft-boltdb"
	"github.com/samber/lo"
	"gitlink.org.cn/cloudream/common/pkgs/async"
	"gitlink.org.cn/cloudream/common/pkgs/logger"
	"gitlink.org.cn/cloudream/jcs-pub/common/ecode"
	"gitlink.org.cn/cloudream/jcs-pub/common/pkgs/rpc"
	clirpc "gitlink.org.cn/cloudream/jcs-pub/common/pkgs/rpc/client"
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials"
)

// Cluster wraps a hashicorp/raft node and exposes membership info,
// leader tracking and log application to the rest of the service.
type Cluster struct {
	cfg       Config
	poolCfg   clirpc.PoolConfig
	masterCli MasterClient
	doneCh    chan any
	raft      *raft.Raft
	fsm       *raftFSM
	transport *Transport
}

// New creates an unstarted Cluster. The master client is fused to fail
// until a leader becomes known.
func New(cfg Config) *Cluster {
	return &Cluster{
		cfg: cfg,
		masterCli: MasterClient{
			Client: clirpc.NewFusedClient(rpc.Failed(ecode.OperationFailed, "master unknown")),
		},
		doneCh: make(chan any, 1),
	}
}

// Start builds the Raft stores and transport, bootstraps the cluster when no
// prior state exists, and returns a channel of cluster events.
func (c *Cluster) Start(fsms []FSM) (*ClusterEventChan, error) {
	log := logger.WithField("Mod", "Cluster")

	ch := async.NewUnboundChannel[ClusterEvent]()
	if !c.cfg.Enabled {
		log.Infof("cluster disabled")
		return ch, nil
	}

	c.fsm = NewFSM(fsms)

	poolCfgJSON := clirpc.PoolConfigJSON{
		RootCA:     c.cfg.RootCA,
		ClientCert: c.cfg.ClientCert,
		ClientKey:  c.cfg.ClientKey,
	}
	poolCfg, err := poolCfgJSON.Build()
	if err != nil {
		return nil, fmt.Errorf("build pool config: %w", err)
	}
	c.poolCfg = *poolCfg

	raftCfg := raft.DefaultConfig()
	raftCfg.LocalID = raft.ServerID(c.cfg.Announce)

	err = os.MkdirAll(c.cfg.StoreBase, 0755)
	if err != nil {
		return nil, fmt.Errorf("create store base dir: %w", err)
	}

	logDB, err := raftboltdb.NewBoltStore(filepath.Join(c.cfg.StoreBase, "log.db"))
	if err != nil {
		return nil, fmt.Errorf("create raft log store: %w", err)
	}

	stableDB, err := raftboltdb.NewBoltStore(filepath.Join(c.cfg.StoreBase, "stable.db"))
	if err != nil {
		return nil, fmt.Errorf("create raft stable store: %w", err)
	}

	snapshotStore, err := raft.NewFileSnapshotStore(c.cfg.StoreBase, 1, os.Stdout)
	if err != nil {
		return nil, fmt.Errorf("create raft snapshot store: %w", err)
	}

	c.transport = NewTransport(c.cfg.Announce, *poolCfg)

	rft, err := raft.NewRaft(raftCfg, c.fsm, logDB, stableDB, snapshotStore, c.transport)
	if err != nil {
		return nil, fmt.Errorf("create raft: %w", err)
	}

	hasState, err := raft.HasExistingState(logDB, stableDB, snapshotStore)
	if err != nil {
		return nil, fmt.Errorf("check has existing state: %w", err)
	}

	if !hasState {
		// Bootstrap a fresh cluster: include this node plus all configured peers,
		// avoiding a duplicate entry when the announce address is also listed as a peer.
		bootCfg := raft.Configuration{}
		if !lo.Contains(c.cfg.Peers, c.cfg.Announce) {
			bootCfg.Servers = append(bootCfg.Servers, raft.Server{
				ID:      raft.ServerID(c.cfg.Announce),
				Address: raft.ServerAddress(c.cfg.Announce),
			})
		}
		for _, peer := range c.cfg.Peers {
			bootCfg.Servers = append(bootCfg.Servers, raft.Server{
				ID:      raft.ServerID(peer),
				Address: raft.ServerAddress(peer),
			})
		}

		bootFut := rft.BootstrapCluster(bootCfg)
		if err := bootFut.Error(); err != nil {
			return nil, fmt.Errorf("bootstrap cluster: %w", err)
		}
		log.Infof("bootstrap new cluster")
	} else {
		log.Infof("start existing cluster")
	}
	ch.Send(&BootstrapEvent{})

	c.raft = rft

	eventCh := make(chan raft.Observation, 1)
	obs := raft.NewObserver(eventCh, true, nil)
	rft.RegisterObserver(obs)

	// Translate raft state observations into cluster events until Stop is called.
	go func() {
	loop:
		for {
			select {
			case <-c.doneCh:
				break loop

			case e := <-eventCh:
				state, ok := e.Data.(raft.RaftState)
				if !ok {
					continue
				}

				switch state {
				case raft.Leader:
					log.Info("become leader")
					ch.Send(&LeaderEvent{
						CurrentIsMaster: true,
						Address:         c.cfg.Announce,
					})

				case raft.Follower:
					addr, id := rft.LeaderWithID()
					log.Infof("become follower, master is: %v, %v", id, addr)
					ch.Send(&LeaderEvent{
						CurrentIsMaster: false,
						Address:         string(addr),
					})

				case raft.Candidate:
					log.Info("become candidate")
				}
			}
		}
		c.raft.DeregisterObserver(obs)
	}()

	return ch, nil
}

// Stop shuts down the raft node and stops the event-forwarding goroutine.
func (c *Cluster) Stop() {
	c.raft.Shutdown().Error()
	select {
	case c.doneCh <- nil:
	default:
	}
}

func (c *Cluster) ID() string {
	return c.cfg.Announce
}

func (c *Cluster) IsMaster() bool {
	addr, _ := c.raft.LeaderWithID()
	return string(addr) == c.cfg.Announce
}

func (c *Cluster) Enabled() bool {
	return c.cfg.Enabled
}

func (c *Cluster) Name() string {
	return c.cfg.NodeName
}

// The master may change at any time, so do not cache the returned MasterClient;
// call this method again each time it is needed.
func (c *Cluster) MasterClient() *MasterClient {
	addr, _ := c.raft.LeaderWithID()

	c.masterCli.lock.Lock()
	defer c.masterCli.lock.Unlock()

	addr2 := string(addr)
	if addr2 == "" {
		if c.masterCli.con != nil {
			c.masterCli.con.Close()
		}
		c.masterCli.Client = clirpc.NewFusedClient(rpc.Failed(ecode.ClusterNoMaster, "no master"))
		return &c.masterCli
	}

	if c.masterCli.addr != addr2 {
		if c.masterCli.con != nil {
			c.masterCli.con.Close()
		}

		gcon, err := grpc.NewClient(addr2, grpc.WithTransportCredentials(credentials.NewTLS(&tls.Config{
			RootCAs:      c.poolCfg.Conn.RootCA,
			Certificates: []tls.Certificate{*c.poolCfg.Conn.ClientCert},
			ServerName:   rpc.InternalAPISNIV1,
			NextProtos:   []string{"h2"},
		})))
		if err != nil {
			c.masterCli.Client = clirpc.NewFusedClient(rpc.Failed(ecode.OperationFailed, "%v", err))
			c.masterCli.addr = ""
		} else {
			c.masterCli.con = gcon
			c.masterCli.Client = clirpc.NewClient(gcon)
			c.masterCli.addr = addr2
		}
	}

	return &c.masterCli
}

func (c *Cluster) RaftTransport() *Transport {
	return c.transport
}

// Apply may only be called on the leader.
func (c *Cluster) Apply(fsmID string, data []byte, timeout time.Duration) ([]byte, error) {
	fsmIDBytes := []byte(fsmID)
	logBytes := make([]byte, 4+len(fsmIDBytes)+len(data))
	// The first 4 bytes hold the length of the FSM ID, followed by the ID and then the data.
	binary.LittleEndian.PutUint32(logBytes[:4], uint32(len(fsmIDBytes)))
	copy(logBytes[4:], fsmIDBytes)
	copy(logBytes[4+len(fsmIDBytes):], data)

	fut := c.raft.Apply(logBytes, timeout)
	err := fut.Error()
	if err != nil {
		return nil, err
	}

	applyRet := fut.Response().(applyResult)
	return applyRet.Result, applyRet.Error
}

type ClusterEvent interface {
	IsClusterEvent() bool
}

type ClusterEventChan = async.UnboundChannel[ClusterEvent]

type ExitEvent struct {
	Err error
}

func (e *ExitEvent) IsClusterEvent() bool {
	return true
}

type BootstrapEvent struct{}

func (e *BootstrapEvent) IsClusterEvent() bool {
	return true
}

type LeaderEvent struct {
	CurrentIsMaster bool
	Address         string
}

func (e *LeaderEvent) IsClusterEvent() bool {
	return true
}

type MasterClient struct {
	*clirpc.Client
	con  *grpc.ClientConn
	addr string
	lock sync.Mutex
}

func (c *MasterClient) Release() {
}
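
// Usage sketch (hedged): the Config field values below are illustrative only, and the
// event-consuming loop is elided because the UnboundChannel receive API is not shown
// in this file. Only functions defined above (New, Start, IsMaster, Apply, Stop) are used.
//
//	c := cluster.New(cluster.Config{
//		Enabled:   true,
//		Announce:  "10.0.0.1:5000",
//		Peers:     []string{"10.0.0.2:5000", "10.0.0.3:5000"},
//		StoreBase: "/var/lib/jcs/raft",
//	})
//	ch, err := c.Start(fsms)
//	if err != nil {
//		// handle startup error
//	}
//	defer c.Stop()
//	// Consume ClusterEvent values from ch (BootstrapEvent, LeaderEvent, ...),
//	// and call c.Apply(fsmID, data, timeout) only while c.IsMaster() reports true.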