| @@ -29,15 +29,17 @@ type AcquireActor struct { | |||||
| etcdCli *clientv3.Client | etcdCli *clientv3.Client | ||||
| providersActor *ProvidersActor | providersActor *ProvidersActor | ||||
| serviceID string | |||||
| acquirings []*acquireInfo | |||||
| lock sync.Mutex | |||||
| isMaintenance bool | |||||
| serviceID string | |||||
| acquirings []*acquireInfo | |||||
| lock sync.Mutex | |||||
| } | } | ||||
| func NewAcquireActor(cfg *Config, etcdCli *clientv3.Client) *AcquireActor { | func NewAcquireActor(cfg *Config, etcdCli *clientv3.Client) *AcquireActor { | ||||
| return &AcquireActor{ | return &AcquireActor{ | ||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| isMaintenance: true, | |||||
| } | } | ||||
| } | } | ||||
| @@ -57,6 +59,12 @@ func (a *AcquireActor) Acquire(ctx context.Context, req LockRequest) (string, er | |||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| a.acquirings = append(a.acquirings, info) | a.acquirings = append(a.acquirings, info) | ||||
| // 如果处于维护模式,那么只接受请求,不实际去处理 | |||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| // TODO 处理错误 | // TODO 处理错误 | ||||
| err := a.doAcquiring() | err := a.doAcquiring() | ||||
| if err != nil { | if err != nil { | ||||
| @@ -93,6 +101,11 @@ func (a *AcquireActor) TryAcquireNow() { | |||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| // 处于维护模式中时,即使是主动触发Acqurire也不予理会 | |||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| err := a.doAcquiring() | err := a.doAcquiring() | ||||
| if err != nil { | if err != nil { | ||||
| logger.Std.Debugf("doing acquiring: %s", err.Error()) | logger.Std.Debugf("doing acquiring: %s", err.Error()) | ||||
| @@ -100,19 +113,27 @@ func (a *AcquireActor) TryAcquireNow() { | |||||
| }() | }() | ||||
| } | } | ||||
| // 进入维护模式。维护模式期间只接受请求,不处理请求。 | |||||
| func (a *AcquireActor) EnterMaintenance() { | |||||
| a.lock.Lock() | |||||
| defer a.lock.Unlock() | |||||
| a.isMaintenance = true | |||||
| } | |||||
| // 退出维护模式。退出之后建议调用一下TryAcquireNow。 | |||||
| func (a *AcquireActor) LeaveMaintenance() { | |||||
| a.lock.Lock() | |||||
| defer a.lock.Unlock() | |||||
| a.isMaintenance = false | |||||
| } | |||||
| func (a *AcquireActor) ResetState(serviceID string) { | func (a *AcquireActor) ResetState(serviceID string) { | ||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| a.serviceID = serviceID | a.serviceID = serviceID | ||||
| for _, info := range a.acquirings { | |||||
| if info.LastErr != nil { | |||||
| info.Callback.SetError(info.LastErr) | |||||
| } else { | |||||
| info.Callback.SetError(ErrAcquiringTimeout) | |||||
| } | |||||
| } | |||||
| a.acquirings = nil | |||||
| } | } | ||||
| func (a *AcquireActor) doAcquiring() error { | func (a *AcquireActor) doAcquiring() error { | ||||
| @@ -136,7 +157,7 @@ func (a *AcquireActor) doAcquiring() error { | |||||
| // 等待本地状态同步到最新 | // 等待本地状态同步到最新 | ||||
| // TODO 配置等待时间 | // TODO 配置等待时间 | ||||
| err = a.providersActor.WaitIndexUpdated(ctx, index) | |||||
| err = a.providersActor.WaitLocalIndexTo(ctx, index) | |||||
| if err != nil { | if err != nil { | ||||
| return err | return err | ||||
| } | } | ||||
| @@ -5,6 +5,7 @@ type Config struct { | |||||
| EtcdUsername string `json:"etcdUsername"` | EtcdUsername string `json:"etcdUsername"` | ||||
| EtcdPassword string `json:"etcdPassword"` | EtcdPassword string `json:"etcdPassword"` | ||||
| EtcdLockLeaseTimeSec int64 `json:"etcdLockLeaseTimeSec"` // 全局锁的租约时间。锁服务会在这个时间内自动续约锁,但如果服务崩溃,则其他服务在租约到期后能重新获得锁。 | |||||
| RandomReleasingDelayMs int64 `json:"randomReleasingDelayMs"` // 释放锁失败,随机延迟之后再次尝试。延迟时间=random(0, RandomReleasingDelayMs) + 最少延迟时间(1000ms) | |||||
| EtcdLockLeaseTimeSec int64 `json:"etcdLockLeaseTimeSec"` // 全局锁的租约时间。锁服务会在这个时间内自动续约锁,但如果服务崩溃,则其他服务在租约到期后能重新获得锁。 | |||||
| RandomReleasingDelayMs int64 `json:"randomReleasingDelayMs"` // 释放锁失败,随机延迟之后再次尝试。延迟时间=random(0, RandomReleasingDelayMs) + 最少延迟时间(1000ms) | |||||
| ServiceDescription string `json:"serviceDescription"` // 锁服务描述信息,锁服务启动后会注册到Etcd中 | |||||
| } | } | ||||
| @@ -91,26 +91,14 @@ func (a *LeaseActor) Remove(reqID string) error { | |||||
| }) | }) | ||||
| } | } | ||||
| func (a *LeaseActor) ResetState() { | |||||
| actor.Wait(context.Background(), a.commandChan, func() error { | |||||
| a.leases = make(map[string]*lockRequestLease) | |||||
| return nil | |||||
| }) | |||||
| } | |||||
| func (a *LeaseActor) Serve() error { | |||||
| func (a *LeaseActor) Serve() { | |||||
| cmdChan := a.commandChan.BeginChanReceive() | cmdChan := a.commandChan.BeginChanReceive() | ||||
| defer a.commandChan.CloseChanReceive() | defer a.commandChan.CloseChanReceive() | ||||
| for { | for { | ||||
| if a.ticker != nil { | if a.ticker != nil { | ||||
| select { | select { | ||||
| case cmd, ok := <-cmdChan: | |||||
| if !ok { | |||||
| a.ticker.Stop() | |||||
| return fmt.Errorf("command chan closed") | |||||
| } | |||||
| case cmd := <-cmdChan: | |||||
| cmd() | cmd() | ||||
| case now := <-a.ticker.C: | case now := <-a.ticker.C: | ||||
| @@ -127,11 +115,7 @@ func (a *LeaseActor) Serve() error { | |||||
| } | } | ||||
| } else { | } else { | ||||
| select { | select { | ||||
| case cmd, ok := <-cmdChan: | |||||
| if !ok { | |||||
| return fmt.Errorf("command chan closed") | |||||
| } | |||||
| case cmd := <-cmdChan: | |||||
| cmd() | cmd() | ||||
| } | } | ||||
| } | } | ||||
| @@ -72,5 +72,6 @@ func MakeServiceInfoKey(svcID string) string { | |||||
| } | } | ||||
| type ServiceInfo struct { | type ServiceInfo struct { | ||||
| ID string `json:"id"` | |||||
| ID string `json:"id"` | |||||
| Description string `json:"description"` | |||||
| } | } | ||||
| @@ -7,7 +7,6 @@ import ( | |||||
| "sync" | "sync" | ||||
| "gitlink.org.cn/cloudream/common/pkgs/future" | "gitlink.org.cn/cloudream/common/pkgs/future" | ||||
| "gitlink.org.cn/cloudream/common/pkgs/logger" | |||||
| "gitlink.org.cn/cloudream/common/pkgs/trie" | "gitlink.org.cn/cloudream/common/pkgs/trie" | ||||
| ) | ) | ||||
| @@ -39,7 +38,7 @@ func (a *ProvidersActor) AddProvider(prov LockProvider, path ...any) { | |||||
| func (a *ProvidersActor) Init() { | func (a *ProvidersActor) Init() { | ||||
| } | } | ||||
| func (a *ProvidersActor) WaitIndexUpdated(ctx context.Context, index int64) error { | |||||
| func (a *ProvidersActor) WaitLocalIndexTo(ctx context.Context, index int64) error { | |||||
| fut := future.NewSetVoid() | fut := future.NewSetVoid() | ||||
| a.lock.Lock() | a.lock.Lock() | ||||
| @@ -56,32 +55,34 @@ func (a *ProvidersActor) WaitIndexUpdated(ctx context.Context, index int64) erro | |||||
| return fut.Wait(ctx) | return fut.Wait(ctx) | ||||
| } | } | ||||
| func (a *ProvidersActor) OnLockRequestEvent(evt LockRequestEvent) { | |||||
| func() { | |||||
| func (a *ProvidersActor) OnLockRequestEvent(evt LockRequestEvent) error { | |||||
| err := func() error { | |||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| if evt.IsLocking { | if evt.IsLocking { | ||||
| err := a.lockLockRequest(evt.Data) | err := a.lockLockRequest(evt.Data) | ||||
| if err != nil { | if err != nil { | ||||
| // TODO 发生这种错误需要重新加载全量状态,下同 | |||||
| logger.Std.Warnf("applying locking event: %s", err.Error()) | |||||
| return | |||||
| return fmt.Errorf("applying locking event: %w", err) | |||||
| } | } | ||||
| } else { | } else { | ||||
| err := a.unlockLockRequest(evt.Data) | err := a.unlockLockRequest(evt.Data) | ||||
| if err != nil { | if err != nil { | ||||
| logger.Std.Warnf("applying unlocking event: %s", err.Error()) | |||||
| return | |||||
| return fmt.Errorf("applying unlocking event: %w", err) | |||||
| } | } | ||||
| } | } | ||||
| a.localLockReqIndex++ | a.localLockReqIndex++ | ||||
| return nil | |||||
| }() | }() | ||||
| if err != nil { | |||||
| return err | |||||
| } | |||||
| // 检查是否有等待同步进度的需求 | // 检查是否有等待同步进度的需求 | ||||
| a.wakeUpIndexWaiter() | a.wakeUpIndexWaiter() | ||||
| return nil | |||||
| } | } | ||||
| func (svc *ProvidersActor) lockLockRequest(reqData LockRequestData) error { | func (svc *ProvidersActor) lockLockRequest(reqData LockRequestData) error { | ||||
| @@ -22,16 +22,18 @@ type ReleaseActor struct { | |||||
| cfg *Config | cfg *Config | ||||
| etcdCli *clientv3.Client | etcdCli *clientv3.Client | ||||
| lock sync.Mutex | |||||
| isMaintenance bool | |||||
| releasingLockRequestIDs map[string]bool | releasingLockRequestIDs map[string]bool | ||||
| timer *time.Timer | timer *time.Timer | ||||
| timerSetup bool | timerSetup bool | ||||
| lock sync.Mutex | |||||
| } | } | ||||
| func NewReleaseActor(cfg *Config, etcdCli *clientv3.Client) *ReleaseActor { | func NewReleaseActor(cfg *Config, etcdCli *clientv3.Client) *ReleaseActor { | ||||
| return &ReleaseActor{ | return &ReleaseActor{ | ||||
| cfg: cfg, | cfg: cfg, | ||||
| etcdCli: etcdCli, | etcdCli: etcdCli, | ||||
| isMaintenance: true, | |||||
| releasingLockRequestIDs: make(map[string]bool), | releasingLockRequestIDs: make(map[string]bool), | ||||
| } | } | ||||
| } | } | ||||
| @@ -45,6 +47,10 @@ func (a *ReleaseActor) Release(reqIDs []string) { | |||||
| a.releasingLockRequestIDs[id] = true | a.releasingLockRequestIDs[id] = true | ||||
| } | } | ||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| // TODO 处理错误 | // TODO 处理错误 | ||||
| err := a.doReleasing() | err := a.doReleasing() | ||||
| if err != nil { | if err != nil { | ||||
| @@ -63,21 +69,48 @@ func (a *ReleaseActor) DelayRelease(reqIDs []string) { | |||||
| a.releasingLockRequestIDs[id] = true | a.releasingLockRequestIDs[id] = true | ||||
| } | } | ||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| a.setupTimer() | a.setupTimer() | ||||
| } | } | ||||
| func (a *ReleaseActor) ResetState(reqIDs []string) { | |||||
| // 重试一下内部的解锁请求。不会阻塞调用者 | |||||
| func (a *ReleaseActor) TryReleaseNow() { | |||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| a.releasingLockRequestIDs = make(map[string]bool) | |||||
| for _, id := range reqIDs { | |||||
| a.releasingLockRequestIDs[id] = true | |||||
| // 如果处于维护模式,那么即使主动进行释放操作,也不予理会 | |||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| // TODO 处理错误 | |||||
| err := a.doReleasing() | |||||
| if err != nil { | |||||
| logger.Std.Debugf("doing releasing: %s", err.Error()) | |||||
| } | } | ||||
| a.setupTimer() | a.setupTimer() | ||||
| } | } | ||||
| // 进入维护模式。在维护模式期间只接受请求,不处理请求,包括延迟释放请求。 | |||||
| func (a *ReleaseActor) EnterMaintenance() { | |||||
| a.lock.Lock() | |||||
| defer a.lock.Unlock() | |||||
| a.isMaintenance = true | |||||
| } | |||||
| // 退出维护模式。退出之后建议调用一下TryReleaseNow。 | |||||
| func (a *ReleaseActor) LeaveMaintenance() { | |||||
| a.lock.Lock() | |||||
| defer a.lock.Unlock() | |||||
| a.isMaintenance = false | |||||
| } | |||||
| func (a *ReleaseActor) OnLockRequestEvent(event LockRequestEvent) { | func (a *ReleaseActor) OnLockRequestEvent(event LockRequestEvent) { | ||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| @@ -157,6 +190,11 @@ func (a *ReleaseActor) setupTimer() { | |||||
| a.timerSetup = false | a.timerSetup = false | ||||
| // 如果处于维护模式,那么即使是定时器要求的释放操作,也不予理会 | |||||
| if a.isMaintenance { | |||||
| return | |||||
| } | |||||
| // TODO 处理错误 | // TODO 处理错误 | ||||
| err := a.doReleasing() | err := a.doReleasing() | ||||
| if err != nil { | if err != nil { | ||||
| @@ -2,6 +2,7 @@ package internal | |||||
| import ( | import ( | ||||
| "context" | "context" | ||||
| "errors" | |||||
| "fmt" | "fmt" | ||||
| "sync" | "sync" | ||||
| @@ -11,6 +12,8 @@ import ( | |||||
| clientv3 "go.etcd.io/etcd/client/v3" | clientv3 "go.etcd.io/etcd/client/v3" | ||||
| ) | ) | ||||
| var ErrSelfServiceDown = errors.New("self service is down, need to restart") | |||||
| type serviceStatus struct { | type serviceStatus struct { | ||||
| Info ServiceInfo | Info ServiceInfo | ||||
| LockRequestIDs []string | LockRequestIDs []string | ||||
| @@ -27,10 +30,11 @@ type ServiceInfoActor struct { | |||||
| services map[string]*serviceStatus | services map[string]*serviceStatus | ||||
| } | } | ||||
| func NewServiceInfoActor(cfg *Config, etcdCli *clientv3.Client) *ServiceInfoActor { | |||||
| func NewServiceInfoActor(cfg *Config, etcdCli *clientv3.Client, baseSelfInfo ServiceInfo) *ServiceInfoActor { | |||||
| return &ServiceInfoActor{ | return &ServiceInfoActor{ | ||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| selfInfo: baseSelfInfo, | |||||
| } | } | ||||
| } | } | ||||
| @@ -91,7 +95,7 @@ func (a *ServiceInfoActor) ResetState(ctx context.Context, currentServices []Ser | |||||
| return willReleaseIDs, nil | return willReleaseIDs, nil | ||||
| } | } | ||||
| func (a *ServiceInfoActor) OnServiceEvent(evt ServiceEvent) { | |||||
| func (a *ServiceInfoActor) OnServiceEvent(evt ServiceEvent) error { | |||||
| a.lock.Lock() | a.lock.Lock() | ||||
| defer a.lock.Unlock() | defer a.lock.Unlock() | ||||
| @@ -104,15 +108,20 @@ func (a *ServiceInfoActor) OnServiceEvent(evt ServiceEvent) { | |||||
| } else { | } else { | ||||
| status, ok := a.services[evt.Info.ID] | status, ok := a.services[evt.Info.ID] | ||||
| if !ok { | if !ok { | ||||
| return | |||||
| return nil | |||||
| } | } | ||||
| a.releaseActor.DelayRelease(status.LockRequestIDs) | a.releaseActor.DelayRelease(status.LockRequestIDs) | ||||
| delete(a.services, evt.Info.ID) | delete(a.services, evt.Info.ID) | ||||
| // TODO 处理收到自己崩溃的消息 | |||||
| // 如果收到的被删除服务信息是自己的,那么自己要重启,重新获取全量数据 | |||||
| if evt.Info.ID == a.selfInfo.ID { | |||||
| return ErrSelfServiceDown | |||||
| } | |||||
| } | } | ||||
| return nil | |||||
| } | } | ||||
| func (a *ServiceInfoActor) OnLockRequestEvent(evt LockRequestEvent) { | func (a *ServiceInfoActor) OnLockRequestEvent(evt LockRequestEvent) { | ||||
| @@ -121,8 +130,9 @@ func (a *ServiceInfoActor) OnLockRequestEvent(evt LockRequestEvent) { | |||||
| status, ok := a.services[evt.Data.SerivceID] | status, ok := a.services[evt.Data.SerivceID] | ||||
| if !ok { | if !ok { | ||||
| // 加锁的是一个没有注册过的锁服务,大概率是因为这个锁服务之前网络发生了波动, | |||||
| // 在波动期间它注册的信息过期,于是被当前的服务删除了。 | |||||
| // 加锁的是一个没有注册过的锁服务,可能是因为这个锁服务之前网络发生了波动, | |||||
| // 在波动期间它注册的信息过期,于是被大家认为服务下线,清理掉了它管理的锁, | |||||
| // 而在网络恢复回来之后,它还没有意识到自己被认为下线了,于是还在提交锁请求。 | |||||
| // 为了防止它加了这个锁之后又崩溃,导致的无限锁定,它加的锁我们都直接释放。 | // 为了防止它加了这个锁之后又崩溃,导致的无限锁定,它加的锁我们都直接释放。 | ||||
| a.releaseActor.Release([]string{evt.Data.ID}) | a.releaseActor.Release([]string{evt.Data.ID}) | ||||
| return | return | ||||
| @@ -24,6 +24,8 @@ type OnLockRequestEventFn func(event LockRequestEvent) | |||||
| type OnServiceEventFn func(event ServiceEvent) | type OnServiceEventFn func(event ServiceEvent) | ||||
| type OnWatchFailedFn func(err error) | |||||
| type WatchEtcdActor struct { | type WatchEtcdActor struct { | ||||
| etcdCli *clientv3.Client | etcdCli *clientv3.Client | ||||
| @@ -31,6 +33,7 @@ type WatchEtcdActor struct { | |||||
| watchChanCancel func() | watchChanCancel func() | ||||
| onLockRequestEventFn OnLockRequestEventFn | onLockRequestEventFn OnLockRequestEventFn | ||||
| onServiceEventFn OnServiceEventFn | onServiceEventFn OnServiceEventFn | ||||
| onWatchFailedFn OnWatchFailedFn | |||||
| commandChan *actor.CommandChannel | commandChan *actor.CommandChannel | ||||
| } | } | ||||
| @@ -41,9 +44,10 @@ func NewWatchEtcdActor(etcdCli *clientv3.Client) *WatchEtcdActor { | |||||
| } | } | ||||
| } | } | ||||
| func (a *WatchEtcdActor) Init(onLockRequestEvent OnLockRequestEventFn, onServiceDown OnServiceEventFn) { | |||||
| func (a *WatchEtcdActor) Init(onLockRequestEvent OnLockRequestEventFn, onServiceDown OnServiceEventFn, onWatchFailed OnWatchFailedFn) { | |||||
| a.onLockRequestEventFn = onLockRequestEvent | a.onLockRequestEventFn = onLockRequestEvent | ||||
| a.onServiceEventFn = onServiceDown | a.onServiceEventFn = onServiceDown | ||||
| a.onWatchFailedFn = onWatchFailed | |||||
| } | } | ||||
| func (a *WatchEtcdActor) Start(revision int64) { | func (a *WatchEtcdActor) Start(revision int64) { | ||||
| @@ -71,40 +75,37 @@ func (a *WatchEtcdActor) Stop() { | |||||
| }) | }) | ||||
| } | } | ||||
| func (a *WatchEtcdActor) Serve() error { | |||||
| func (a *WatchEtcdActor) Serve() { | |||||
| cmdChan := a.commandChan.BeginChanReceive() | cmdChan := a.commandChan.BeginChanReceive() | ||||
| defer a.commandChan.CloseChanReceive() | defer a.commandChan.CloseChanReceive() | ||||
| for { | for { | ||||
| if a.watchChan != nil { | if a.watchChan != nil { | ||||
| select { | select { | ||||
| case cmd, ok := <-cmdChan: | |||||
| if !ok { | |||||
| return fmt.Errorf("command channel closed") | |||||
| } | |||||
| case cmd := <-cmdChan: | |||||
| cmd() | cmd() | ||||
| case msg := <-a.watchChan: | case msg := <-a.watchChan: | ||||
| // 只要发生错误,就停止监听,通知外部处理 | |||||
| if msg.Canceled { | if msg.Canceled { | ||||
| // TODO 更好的错误处理 | |||||
| return fmt.Errorf("watch etcd channel closed") | |||||
| a.onWatchFailedFn(fmt.Errorf("watch etcd channel closed")) | |||||
| a.watchChanCancel() | |||||
| a.watchChan = nil | |||||
| continue | |||||
| } | } | ||||
| err := a.dispatchEtcdEvent(msg) | err := a.dispatchEtcdEvent(msg) | ||||
| if err != nil { | if err != nil { | ||||
| // TODO 更好的错误处理 | |||||
| return err | |||||
| a.onWatchFailedFn(err) | |||||
| a.watchChanCancel() | |||||
| a.watchChan = nil | |||||
| continue | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| select { | select { | ||||
| case cmd, ok := <-cmdChan: | |||||
| if !ok { | |||||
| return fmt.Errorf("command channel closed") | |||||
| } | |||||
| case cmd := <-cmdChan: | |||||
| cmd() | cmd() | ||||
| } | } | ||||
| } | } | ||||
| @@ -6,6 +6,7 @@ import ( | |||||
| "strconv" | "strconv" | ||||
| "time" | "time" | ||||
| "gitlink.org.cn/cloudream/common/pkgs/actor" | |||||
| "gitlink.org.cn/cloudream/common/pkgs/distlock/internal" | "gitlink.org.cn/cloudream/common/pkgs/distlock/internal" | ||||
| "gitlink.org.cn/cloudream/common/pkgs/logger" | "gitlink.org.cn/cloudream/common/pkgs/logger" | ||||
| "gitlink.org.cn/cloudream/common/utils/serder" | "gitlink.org.cn/cloudream/common/utils/serder" | ||||
| @@ -47,6 +48,7 @@ type Service struct { | |||||
| cfg *internal.Config | cfg *internal.Config | ||||
| etcdCli *clientv3.Client | etcdCli *clientv3.Client | ||||
| cmdChan *actor.CommandChannel | |||||
| acquireActor *internal.AcquireActor | acquireActor *internal.AcquireActor | ||||
| releaseActor *internal.ReleaseActor | releaseActor *internal.ReleaseActor | ||||
| providersActor *internal.ProvidersActor | providersActor *internal.ProvidersActor | ||||
| @@ -66,42 +68,55 @@ func NewService(cfg *internal.Config, initProvs []PathProvider) (*Service, error | |||||
| return nil, fmt.Errorf("new etcd client failed, err: %w", err) | return nil, fmt.Errorf("new etcd client failed, err: %w", err) | ||||
| } | } | ||||
| acquireActor := internal.NewAcquireActor(cfg, etcdCli) | |||||
| releaseActor := internal.NewReleaseActor(cfg, etcdCli) | |||||
| providersActor := internal.NewProvidersActor() | |||||
| watchEtcdActor := internal.NewWatchEtcdActor(etcdCli) | |||||
| leaseActor := internal.NewLeaseActor() | |||||
| serviceInfoActor := internal.NewServiceInfoActor(cfg, etcdCli) | |||||
| acquireActor.Init(providersActor) | |||||
| leaseActor.Init(releaseActor) | |||||
| providersActor.Init() | |||||
| watchEtcdActor.Init( | |||||
| svc := &Service{ | |||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| cmdChan: actor.NewCommandChannel(), | |||||
| } | |||||
| svc.acquireActor = internal.NewAcquireActor(cfg, etcdCli) | |||||
| svc.releaseActor = internal.NewReleaseActor(cfg, etcdCli) | |||||
| svc.providersActor = internal.NewProvidersActor() | |||||
| svc.watchEtcdActor = internal.NewWatchEtcdActor(etcdCli) | |||||
| svc.leaseActor = internal.NewLeaseActor() | |||||
| svc.serviceInfoActor = internal.NewServiceInfoActor(cfg, etcdCli, internal.ServiceInfo{ | |||||
| Description: cfg.ServiceDescription, | |||||
| }) | |||||
| svc.acquireActor.Init(svc.providersActor) | |||||
| svc.leaseActor.Init(svc.releaseActor) | |||||
| svc.providersActor.Init() | |||||
| svc.watchEtcdActor.Init( | |||||
| func(event internal.LockRequestEvent) { | func(event internal.LockRequestEvent) { | ||||
| providersActor.OnLockRequestEvent(event) | |||||
| acquireActor.TryAcquireNow() | |||||
| releaseActor.OnLockRequestEvent(event) | |||||
| serviceInfoActor.OnLockRequestEvent(event) | |||||
| err := svc.providersActor.OnLockRequestEvent(event) | |||||
| if err != nil { | |||||
| logger.Std.Warnf("%s, will reset service state", err.Error()) | |||||
| svc.cmdChan.Send(func() { svc.doResetState() }) | |||||
| return | |||||
| } | |||||
| svc.acquireActor.TryAcquireNow() | |||||
| svc.releaseActor.OnLockRequestEvent(event) | |||||
| svc.serviceInfoActor.OnLockRequestEvent(event) | |||||
| }, | }, | ||||
| func(event internal.ServiceEvent) { | func(event internal.ServiceEvent) { | ||||
| serviceInfoActor.OnServiceEvent(event) | |||||
| err := svc.serviceInfoActor.OnServiceEvent(event) | |||||
| if err != nil { | |||||
| logger.Std.Warnf("%s, will reset service state", err.Error()) | |||||
| svc.cmdChan.Send(func() { svc.doResetState() }) | |||||
| } | |||||
| }, | |||||
| func(err error) { | |||||
| logger.Std.Warnf("%s, will reset service state", err.Error()) | |||||
| svc.cmdChan.Send(func() { svc.doResetState() }) | |||||
| }, | }, | ||||
| ) | ) | ||||
| for _, prov := range initProvs { | for _, prov := range initProvs { | ||||
| providersActor.AddProvider(prov.Provider, prov.Path...) | |||||
| svc.providersActor.AddProvider(prov.Provider, prov.Path...) | |||||
| } | } | ||||
| return &Service{ | |||||
| cfg: cfg, | |||||
| etcdCli: etcdCli, | |||||
| acquireActor: acquireActor, | |||||
| releaseActor: releaseActor, | |||||
| providersActor: providersActor, | |||||
| watchEtcdActor: watchEtcdActor, | |||||
| leaseActor: leaseActor, | |||||
| serviceInfoActor: serviceInfoActor, | |||||
| }, nil | |||||
| return svc, nil | |||||
| } | } | ||||
| // Acquire 请求一批锁。成功后返回锁请求ID | // Acquire 请求一批锁。成功后返回锁请求ID | ||||
| @@ -158,39 +173,48 @@ func (svc *Service) Serve() error { | |||||
| // 1. client退出时直接中断进程,此时AcquireActor可能正在进行重试,于是导致Etcd锁没有解除就退出了进程。 | // 1. client退出时直接中断进程,此时AcquireActor可能正在进行重试,于是导致Etcd锁没有解除就退出了进程。 | ||||
| // 虽然由于租约的存在不会导致系统长期卡死,但会影响client的使用 | // 虽然由于租约的存在不会导致系统长期卡死,但会影响client的使用 | ||||
| go func() { | |||||
| // TODO 处理错误 | |||||
| err := svc.watchEtcdActor.Serve() | |||||
| if err != nil { | |||||
| logger.Std.Warnf("serving watch etcd actor actor failed, err: %s", err.Error()) | |||||
| } | |||||
| }() | |||||
| go svc.watchEtcdActor.Serve() | |||||
| go func() { | |||||
| // TODO 处理错误 | |||||
| err := svc.leaseActor.Serve() | |||||
| if err != nil { | |||||
| logger.Std.Warnf("serving lease actor failed, err: %s", err.Error()) | |||||
| go svc.leaseActor.Serve() | |||||
| svc.cmdChan.Send(func() { svc.doResetState() }) | |||||
| cmdChan := svc.cmdChan.BeginChanReceive() | |||||
| defer svc.cmdChan.CloseChanReceive() | |||||
| for { | |||||
| select { | |||||
| case cmd := <-cmdChan: | |||||
| cmd() | |||||
| } | } | ||||
| }() | |||||
| } | |||||
| return nil | |||||
| } | |||||
| func (svc *Service) doResetState() { | |||||
| logger.Std.Infof("start reset state") | |||||
| // TODO context | // TODO context | ||||
| err := svc.resetState(context.Background()) | err := svc.resetState(context.Background()) | ||||
| if err != nil { | if err != nil { | ||||
| // TODO 关闭其他的Actor,或者更好的错误处理方式 | |||||
| return fmt.Errorf("init data failed, err: %w", err) | |||||
| logger.Std.Warnf("reseting state: %s, will try again after 3 seconds", err.Error()) | |||||
| <-time.After(time.Second * 3) | |||||
| svc.cmdChan.Send(func() { svc.doResetState() }) | |||||
| return | |||||
| } | } | ||||
| // TODO 防止退出的临时解决办法 | |||||
| ch := make(chan any) | |||||
| <-ch | |||||
| return nil | |||||
| logger.Std.Infof("reset state success") | |||||
| } | } | ||||
| // ResetState 重置内部状态。注:只要调用到了此函数,无论在哪一步出的错, | // ResetState 重置内部状态。注:只要调用到了此函数,无论在哪一步出的错, | ||||
| // 都要将内部状态视为已被破坏,直到成功调用了此函数才能继续后面的步骤 | |||||
| // 都要将内部状态视为已被破坏,直到成功调用了此函数才能继续后面的步骤。 | |||||
| // 如果调用失败,服务将进入维护模式,届时可以接受请求,但不会处理请求,直到调用成功为止。 | |||||
| func (svc *Service) resetState(ctx context.Context) error { | func (svc *Service) resetState(ctx context.Context) error { | ||||
| // 让服务都进入维护模式 | |||||
| svc.watchEtcdActor.Stop() | |||||
| svc.leaseActor.Stop() | |||||
| svc.acquireActor.EnterMaintenance() | |||||
| svc.releaseActor.EnterMaintenance() | |||||
| // 必须使用事务一次性获取所有数据 | // 必须使用事务一次性获取所有数据 | ||||
| txResp, err := svc.etcdCli.Txn(ctx). | txResp, err := svc.etcdCli.Txn(ctx). | ||||
| Then( | Then( | ||||
| @@ -240,29 +264,31 @@ func (svc *Service) resetState(ctx context.Context) error { | |||||
| svcInfo = append(svcInfo, info) | svcInfo = append(svcInfo, info) | ||||
| } | } | ||||
| // 先停止监听等定时事件 | |||||
| svc.watchEtcdActor.Stop() | |||||
| svc.leaseActor.Stop() | |||||
| // 然后将新获取到的状态装填到Actor中。注:执行顺序需要考虑Actor会被谁调用,不会被调用的优先Reset。 | |||||
| // 然后将新获取到的状态装填到Actor中 | |||||
| releasingIDs, err := svc.serviceInfoActor.ResetState(ctx, svcInfo, reqData) | releasingIDs, err := svc.serviceInfoActor.ResetState(ctx, svcInfo, reqData) | ||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("reseting service info actor: %w", err) | return fmt.Errorf("reseting service info actor: %w", err) | ||||
| } | } | ||||
| svc.acquireActor.ResetState(svc.serviceInfoActor.GetSelfInfo().ID) | |||||
| svc.leaseActor.ResetState() | |||||
| // 要在acquireActor之前,因为acquireActor会调用它的WaitLocalIndexTo | |||||
| err = svc.providersActor.ResetState(index, reqData) | err = svc.providersActor.ResetState(index, reqData) | ||||
| if err != nil { | if err != nil { | ||||
| return fmt.Errorf("reseting providers actor: %w", err) | return fmt.Errorf("reseting providers actor: %w", err) | ||||
| } | } | ||||
| svc.releaseActor.ResetState(releasingIDs) | |||||
| svc.acquireActor.ResetState(svc.serviceInfoActor.GetSelfInfo().ID) | |||||
| // 重置完了之后再启动监听 | |||||
| // ReleaseActor没有什么需要Reset的状态 | |||||
| svc.releaseActor.Release(releasingIDs) | |||||
| // 重置完了之后再退出维护模式 | |||||
| svc.watchEtcdActor.Start(txResp.Header.Revision) | svc.watchEtcdActor.Start(txResp.Header.Revision) | ||||
| svc.leaseActor.Start() | svc.leaseActor.Start() | ||||
| svc.acquireActor.LeaveMaintenance() | |||||
| svc.releaseActor.LeaveMaintenance() | |||||
| svc.acquireActor.TryAcquireNow() | |||||
| svc.releaseActor.TryReleaseNow() | |||||
| return nil | return nil | ||||
| } | } | ||||