You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

grpclb.go 15 kB

5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484
  1. /*
  2. *
  3. * Copyright 2016 gRPC authors.
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. *
  17. */
  18. //go:generate ./regenerate.sh
  19. // Package grpclb defines a grpclb balancer.
  20. //
  21. // To install grpclb balancer, import this package as:
  22. // import _ "google.golang.org/grpc/balancer/grpclb"
  23. package grpclb
  24. import (
  25. "context"
  26. "errors"
  27. "strconv"
  28. "sync"
  29. "time"
  30. durationpb "github.com/golang/protobuf/ptypes/duration"
  31. "google.golang.org/grpc"
  32. "google.golang.org/grpc/balancer"
  33. lbpb "google.golang.org/grpc/balancer/grpclb/grpc_lb_v1"
  34. "google.golang.org/grpc/connectivity"
  35. "google.golang.org/grpc/credentials"
  36. "google.golang.org/grpc/grpclog"
  37. "google.golang.org/grpc/internal"
  38. "google.golang.org/grpc/internal/backoff"
  39. "google.golang.org/grpc/resolver"
  40. )
const (
	// lbTokeyKey holds the string "lb-token". It is not referenced in this
	// part of the file.
	// NOTE(review): presumably the metadata key under which the load balancer
	// token is sent, and the identifier looks like a typo of lbTokenKey —
	// confirm against the other files in the package before renaming.
	lbTokeyKey = "lb-token"
	// defaultFallbackTimeout is the fallback timeout newLBBuilder passes to
	// newLBBuilderWithFallbackTimeout.
	defaultFallbackTimeout = 10 * time.Second
	// grpclbName is the balancer name returned by lbBuilder.Name.
	grpclbName = "grpclb"
)
var (
	// defaultBackoffConfig configures the backoff strategy that's used when the
	// init handshake in the RPC is unsuccessful. It's not for the clientconn
	// reconnect backoff.
	//
	// It has the same value as the default grpc.DefaultBackoffConfig.
	//
	// TODO: make backoff configurable.
	defaultBackoffConfig = backoff.Exponential{
		MaxDelay: 120 * time.Second,
	}
	// errServerTerminatedConnection is a sentinel error. It is not used in this
	// part of the file.
	// NOTE(review): presumably reported when the remote balancer ends the
	// BalanceLoad stream — confirm at the use site.
	errServerTerminatedConnection = errors.New("grpclb: failed to recv server list: server terminated connection")
)
  59. func convertDuration(d *durationpb.Duration) time.Duration {
  60. if d == nil {
  61. return 0
  62. }
  63. return time.Duration(d.Seconds)*time.Second + time.Duration(d.Nanos)*time.Nanosecond
  64. }
// loadBalancerClient is the client API for the LoadBalancer service.
//
// It is mostly copied from the generated pb.go file, to avoid a circular
// dependency.
type loadBalancerClient struct {
	// cc is the ClientConn on which BalanceLoad opens its stream.
	cc *grpc.ClientConn
}
  71. func (c *loadBalancerClient) BalanceLoad(ctx context.Context, opts ...grpc.CallOption) (*balanceLoadClientStream, error) {
  72. desc := &grpc.StreamDesc{
  73. StreamName: "BalanceLoad",
  74. ServerStreams: true,
  75. ClientStreams: true,
  76. }
  77. stream, err := c.cc.NewStream(ctx, desc, "/grpc.lb.v1.LoadBalancer/BalanceLoad", opts...)
  78. if err != nil {
  79. return nil, err
  80. }
  81. x := &balanceLoadClientStream{stream}
  82. return x, nil
  83. }
// balanceLoadClientStream wraps the grpc.ClientStream returned by
// loadBalancerClient.BalanceLoad and adds typed Send and Recv helpers for
// LoadBalanceRequest and LoadBalanceResponse messages.
type balanceLoadClientStream struct {
	grpc.ClientStream
}
// Send writes m to the stream by delegating to the embedded ClientStream's
// SendMsg, and returns whatever error SendMsg reports.
func (x *balanceLoadClientStream) Send(m *lbpb.LoadBalanceRequest) error {
	return x.ClientStream.SendMsg(m)
}
  90. func (x *balanceLoadClientStream) Recv() (*lbpb.LoadBalanceResponse, error) {
  91. m := new(lbpb.LoadBalanceResponse)
  92. if err := x.ClientStream.RecvMsg(m); err != nil {
  93. return nil, err
  94. }
  95. return m, nil
  96. }
// init registers the grpclb builder (with the default fallback timeout) in the
// balancer registry when this package is imported.
func init() {
	balancer.Register(newLBBuilder())
}
// newLBBuilder creates a builder for grpclb that uses defaultFallbackTimeout
// as its fallback timeout.
func newLBBuilder() balancer.Builder {
	return newLBBuilderWithFallbackTimeout(defaultFallbackTimeout)
}
  104. // newLBBuilderWithFallbackTimeout creates a grpclb builder with the given
  105. // fallbackTimeout. If no response is received from the remote balancer within
  106. // fallbackTimeout, the backend addresses from the resolved address list will be
  107. // used.
  108. //
  109. // Only call this function when a non-default fallback timeout is needed.
  110. func newLBBuilderWithFallbackTimeout(fallbackTimeout time.Duration) balancer.Builder {
  111. return &lbBuilder{
  112. fallbackTimeout: fallbackTimeout,
  113. }
  114. }
// lbBuilder is the balancer.Builder for grpclb.
type lbBuilder struct {
	// fallbackTimeout is copied into every lbBalancer created by Build.
	fallbackTimeout time.Duration
}
// Name returns the name of the balancer built by this builder, grpclbName.
func (b *lbBuilder) Name() string {
	return grpclbName
}
  121. func (b *lbBuilder) Build(cc balancer.ClientConn, opt balancer.BuildOptions) balancer.Balancer {
  122. // This generates a manual resolver builder with a random scheme. This
  123. // scheme will be used to dial to remote LB, so we can send filtered address
  124. // updates to remote LB ClientConn using this manual resolver.
  125. scheme := "grpclb_internal_" + strconv.FormatInt(time.Now().UnixNano(), 36)
  126. r := &lbManualResolver{scheme: scheme, ccb: cc}
  127. lb := &lbBalancer{
  128. cc: newLBCacheClientConn(cc),
  129. target: opt.Target.Endpoint,
  130. opt: opt,
  131. fallbackTimeout: b.fallbackTimeout,
  132. doneCh: make(chan struct{}),
  133. manualResolver: r,
  134. subConns: make(map[resolver.Address]balancer.SubConn),
  135. scStates: make(map[balancer.SubConn]connectivity.State),
  136. picker: &errPicker{err: balancer.ErrNoSubConnAvailable},
  137. clientStats: newRPCStats(),
  138. backoff: defaultBackoffConfig, // TODO: make backoff configurable.
  139. }
  140. var err error
  141. if opt.CredsBundle != nil {
  142. lb.grpclbClientConnCreds, err = opt.CredsBundle.NewWithMode(internal.CredsBundleModeBalancer)
  143. if err != nil {
  144. grpclog.Warningf("lbBalancer: client connection creds NewWithMode failed: %v", err)
  145. }
  146. lb.grpclbBackendCreds, err = opt.CredsBundle.NewWithMode(internal.CredsBundleModeBackendFromBalancer)
  147. if err != nil {
  148. grpclog.Warningf("lbBalancer: backend creds NewWithMode failed: %v", err)
  149. }
  150. }
  151. return lb
  152. }
// lbBalancer is the balancer.Balancer implementation created by
// lbBuilder.Build.
type lbBalancer struct {
	// cc is the cache-wrapped form (see newLBCacheClientConn) of the
	// balancer.ClientConn handed to Build.
	cc *lbCacheClientConn
	// target is opt.Target.Endpoint as passed to Build.
	target string
	// opt holds the BuildOptions passed to Build.
	opt balancer.BuildOptions

	// usePickFirst selects pick-first mode. When true, regeneratePicker hands
	// the picker a single SubConn taken from subConns instead of every READY
	// one.
	usePickFirst bool

	// grpclbClientConnCreds is the creds bundle to be used to connect to grpclb
	// servers. If it's nil, use the TransportCredentials from BuildOptions
	// instead.
	grpclbClientConnCreds credentials.Bundle
	// grpclbBackendCreds is the creds bundle to be used for addresses that are
	// returned by grpclb server. If it's nil, don't set anything when creating
	// SubConns.
	grpclbBackendCreds credentials.Bundle

	// fallbackTimeout is how long fallbackToBackendsAfter waits before
	// considering fallback to the resolved backends.
	fallbackTimeout time.Duration
	// doneCh is closed by Close; goroutines such as fallbackToBackendsAfter
	// select on it to stop early.
	doneCh chan struct{}

	// manualResolver is used in the remote LB ClientConn inside grpclb. When
	// resolved address updates are received by grpclb, filtered updates will be
	// sent to remote LB ClientConn through this resolver.
	manualResolver *lbManualResolver
	// The ClientConn to talk to the remote balancer.
	ccRemoteLB *grpc.ClientConn
	// backoff for calling remote balancer.
	backoff backoff.Strategy

	// Support client side load reporting. Each picker gets a reference to this,
	// and will update its content.
	clientStats *rpcStats

	mu sync.Mutex // guards everything following.
	// The full server list including drops, used to check if the newly received
	// serverList contains anything new. Each generated picker will also hold a
	// reference to this list to do the first layer pick.
	fullServerList []*lbpb.Server
	// Backend addresses. It's kept so the addresses are available when
	// switching between round_robin and pickfirst.
	backendAddrs []resolver.Address
	// All backends addresses, with metadata set to nil. This list contains all
	// backend addresses in the same order and with the same duplicates as in
	// serverlist. When generating picker, a SubConn slice with the same order
	// but with only READY SCs will be generated.
	backendAddrsWithoutMetadata []resolver.Address
	// Roundrobin functionalities.
	// state is the aggregated connectivity state last computed by
	// updateStateAndPicker.
	state connectivity.State
	subConns map[resolver.Address]balancer.SubConn // Used to new/remove SubConn.
	scStates map[balancer.SubConn]connectivity.State // Used to filter READY SubConns.
	// picker is the picker most recently produced by regeneratePicker and
	// reported through cc.UpdateBalancerState.
	picker balancer.Picker
	// Support fallback to resolved backend addresses if there's no response
	// from remote balancer within fallbackTimeout.
	remoteBalancerConnected bool
	serverListReceived bool
	inFallback bool
	// resolvedBackendAddrs is resolvedAddrs minus remote balancers. It's set
	// when resolved address updates are received, and read in the goroutine
	// handling fallback.
	resolvedBackendAddrs []resolver.Address
}
  207. // regeneratePicker takes a snapshot of the balancer, and generates a picker from
  208. // it. The picker
  209. // - always returns ErrTransientFailure if the balancer is in TransientFailure,
  210. // - does two layer roundrobin pick otherwise.
  211. // Caller must hold lb.mu.
  212. func (lb *lbBalancer) regeneratePicker(resetDrop bool) {
  213. if lb.state == connectivity.TransientFailure {
  214. lb.picker = &errPicker{err: balancer.ErrTransientFailure}
  215. return
  216. }
  217. if lb.state == connectivity.Connecting {
  218. lb.picker = &errPicker{err: balancer.ErrNoSubConnAvailable}
  219. return
  220. }
  221. var readySCs []balancer.SubConn
  222. if lb.usePickFirst {
  223. for _, sc := range lb.subConns {
  224. readySCs = append(readySCs, sc)
  225. break
  226. }
  227. } else {
  228. for _, a := range lb.backendAddrsWithoutMetadata {
  229. if sc, ok := lb.subConns[a]; ok {
  230. if st, ok := lb.scStates[sc]; ok && st == connectivity.Ready {
  231. readySCs = append(readySCs, sc)
  232. }
  233. }
  234. }
  235. }
  236. if len(readySCs) <= 0 {
  237. // If there's no ready SubConns, always re-pick. This is to avoid drops
  238. // unless at least one SubConn is ready. Otherwise we may drop more
  239. // often than want because of drops + re-picks(which become re-drops).
  240. //
  241. // This doesn't seem to be necessary after the connecting check above.
  242. // Kept for safety.
  243. lb.picker = &errPicker{err: balancer.ErrNoSubConnAvailable}
  244. return
  245. }
  246. if lb.inFallback {
  247. lb.picker = newRRPicker(readySCs)
  248. return
  249. }
  250. if resetDrop {
  251. lb.picker = newLBPicker(lb.fullServerList, readySCs, lb.clientStats)
  252. return
  253. }
  254. prevLBPicker, ok := lb.picker.(*lbPicker)
  255. if !ok {
  256. lb.picker = newLBPicker(lb.fullServerList, readySCs, lb.clientStats)
  257. return
  258. }
  259. prevLBPicker.updateReadySCs(readySCs)
  260. }
  261. // aggregateSubConnStats calculate the aggregated state of SubConns in
  262. // lb.SubConns. These SubConns are subconns in use (when switching between
  263. // fallback and grpclb). lb.scState contains states for all SubConns, including
  264. // those in cache (SubConns are cached for 10 seconds after remove).
  265. //
  266. // The aggregated state is:
  267. // - If at least one SubConn in Ready, the aggregated state is Ready;
  268. // - Else if at least one SubConn in Connecting, the aggregated state is Connecting;
  269. // - Else the aggregated state is TransientFailure.
  270. func (lb *lbBalancer) aggregateSubConnStates() connectivity.State {
  271. var numConnecting uint64
  272. for _, sc := range lb.subConns {
  273. if state, ok := lb.scStates[sc]; ok {
  274. switch state {
  275. case connectivity.Ready:
  276. return connectivity.Ready
  277. case connectivity.Connecting:
  278. numConnecting++
  279. }
  280. }
  281. }
  282. if numConnecting > 0 {
  283. return connectivity.Connecting
  284. }
  285. return connectivity.TransientFailure
  286. }
// HandleSubConnStateChange is not used by this balancer and panics if called.
// SubConn state changes are handled by UpdateSubConnState.
func (lb *lbBalancer) HandleSubConnStateChange(sc balancer.SubConn, s connectivity.State) {
	panic("not used")
}
  290. func (lb *lbBalancer) UpdateSubConnState(sc balancer.SubConn, scs balancer.SubConnState) {
  291. s := scs.ConnectivityState
  292. if grpclog.V(2) {
  293. grpclog.Infof("lbBalancer: handle SubConn state change: %p, %v", sc, s)
  294. }
  295. lb.mu.Lock()
  296. defer lb.mu.Unlock()
  297. oldS, ok := lb.scStates[sc]
  298. if !ok {
  299. if grpclog.V(2) {
  300. grpclog.Infof("lbBalancer: got state changes for an unknown SubConn: %p, %v", sc, s)
  301. }
  302. return
  303. }
  304. lb.scStates[sc] = s
  305. switch s {
  306. case connectivity.Idle:
  307. sc.Connect()
  308. case connectivity.Shutdown:
  309. // When an address was removed by resolver, b called RemoveSubConn but
  310. // kept the sc's state in scStates. Remove state for this sc here.
  311. delete(lb.scStates, sc)
  312. }
  313. // Force regenerate picker if
  314. // - this sc became ready from not-ready
  315. // - this sc became not-ready from ready
  316. lb.updateStateAndPicker((oldS == connectivity.Ready) != (s == connectivity.Ready), false)
  317. // Enter fallback when the aggregated state is not Ready and the connection
  318. // to remote balancer is lost.
  319. if lb.state != connectivity.Ready {
  320. if !lb.inFallback && !lb.remoteBalancerConnected {
  321. // Enter fallback.
  322. lb.refreshSubConns(lb.resolvedBackendAddrs, true, lb.usePickFirst)
  323. }
  324. }
  325. }
  326. // updateStateAndPicker re-calculate the aggregated state, and regenerate picker
  327. // if overall state is changed.
  328. //
  329. // If forceRegeneratePicker is true, picker will be regenerated.
  330. func (lb *lbBalancer) updateStateAndPicker(forceRegeneratePicker bool, resetDrop bool) {
  331. oldAggrState := lb.state
  332. lb.state = lb.aggregateSubConnStates()
  333. // Regenerate picker when one of the following happens:
  334. // - caller wants to regenerate
  335. // - the aggregated state changed
  336. if forceRegeneratePicker || (lb.state != oldAggrState) {
  337. lb.regeneratePicker(resetDrop)
  338. }
  339. lb.cc.UpdateBalancerState(lb.state, lb.picker)
  340. }
  341. // fallbackToBackendsAfter blocks for fallbackTimeout and falls back to use
  342. // resolved backends (backends received from resolver, not from remote balancer)
  343. // if no connection to remote balancers was successful.
  344. func (lb *lbBalancer) fallbackToBackendsAfter(fallbackTimeout time.Duration) {
  345. timer := time.NewTimer(fallbackTimeout)
  346. defer timer.Stop()
  347. select {
  348. case <-timer.C:
  349. case <-lb.doneCh:
  350. return
  351. }
  352. lb.mu.Lock()
  353. if lb.inFallback || lb.serverListReceived {
  354. lb.mu.Unlock()
  355. return
  356. }
  357. // Enter fallback.
  358. lb.refreshSubConns(lb.resolvedBackendAddrs, true, lb.usePickFirst)
  359. lb.mu.Unlock()
  360. }
// HandleResolvedAddrs is not used by this balancer and panics if called.
// Resolver updates are handled by UpdateResolverState, which sends the updated
// remoteLB addresses to the remoteLB ClientConn.
func (lb *lbBalancer) HandleResolvedAddrs(addrs []resolver.Address, err error) {
	panic("not used")
}
  367. func (lb *lbBalancer) handleServiceConfig(sc string) {
  368. lb.mu.Lock()
  369. defer lb.mu.Unlock()
  370. newUsePickFirst := childIsPickFirst(sc)
  371. if lb.usePickFirst == newUsePickFirst {
  372. return
  373. }
  374. if grpclog.V(2) {
  375. grpclog.Infof("lbBalancer: switching mode, new usePickFirst: %+v", newUsePickFirst)
  376. }
  377. lb.refreshSubConns(lb.backendAddrs, lb.inFallback, newUsePickFirst)
  378. }
  379. func (lb *lbBalancer) UpdateResolverState(rs resolver.State) {
  380. if grpclog.V(2) {
  381. grpclog.Infof("lbBalancer: UpdateResolverState: %+v", rs)
  382. }
  383. lb.handleServiceConfig(rs.ServiceConfig)
  384. addrs := rs.Addresses
  385. if len(addrs) <= 0 {
  386. return
  387. }
  388. var remoteBalancerAddrs, backendAddrs []resolver.Address
  389. for _, a := range addrs {
  390. if a.Type == resolver.GRPCLB {
  391. a.Type = resolver.Backend
  392. remoteBalancerAddrs = append(remoteBalancerAddrs, a)
  393. } else {
  394. backendAddrs = append(backendAddrs, a)
  395. }
  396. }
  397. if lb.ccRemoteLB == nil {
  398. if len(remoteBalancerAddrs) <= 0 {
  399. grpclog.Errorf("grpclb: no remote balancer address is available, should never happen")
  400. return
  401. }
  402. // First time receiving resolved addresses, create a cc to remote
  403. // balancers.
  404. lb.dialRemoteLB(remoteBalancerAddrs[0].ServerName)
  405. // Start the fallback goroutine.
  406. go lb.fallbackToBackendsAfter(lb.fallbackTimeout)
  407. }
  408. // cc to remote balancers uses lb.manualResolver. Send the updated remote
  409. // balancer addresses to it through manualResolver.
  410. lb.manualResolver.UpdateState(resolver.State{Addresses: remoteBalancerAddrs})
  411. lb.mu.Lock()
  412. lb.resolvedBackendAddrs = backendAddrs
  413. if lb.inFallback {
  414. // This means we received a new list of resolved backends, and we are
  415. // still in fallback mode. Need to update the list of backends we are
  416. // using to the new list of backends.
  417. lb.refreshSubConns(lb.resolvedBackendAddrs, true, lb.usePickFirst)
  418. }
  419. lb.mu.Unlock()
  420. }
  421. func (lb *lbBalancer) Close() {
  422. select {
  423. case <-lb.doneCh:
  424. return
  425. default:
  426. }
  427. close(lb.doneCh)
  428. if lb.ccRemoteLB != nil {
  429. lb.ccRemoteLB.Close()
  430. }
  431. lb.cc.close()
  432. }