transport.go 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363
  1. package kafka
  2. import (
  3. "context"
  4. "crypto/tls"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "math/rand"
  9. "net"
  10. "runtime/pprof"
  11. "sort"
  12. "strconv"
  13. "strings"
  14. "sync"
  15. "sync/atomic"
  16. "time"
  17. "github.com/segmentio/kafka-go/protocol"
  18. "github.com/segmentio/kafka-go/protocol/apiversions"
  19. "github.com/segmentio/kafka-go/protocol/createtopics"
  20. "github.com/segmentio/kafka-go/protocol/findcoordinator"
  21. meta "github.com/segmentio/kafka-go/protocol/metadata"
  22. "github.com/segmentio/kafka-go/protocol/saslauthenticate"
  23. "github.com/segmentio/kafka-go/protocol/saslhandshake"
  24. "github.com/segmentio/kafka-go/sasl"
  25. )
  26. // Request is an interface implemented by types that represent messages sent
  27. // from kafka clients to brokers.
  28. type Request = protocol.Message
  29. // Response is an interface implemented by types that represent messages sent
  30. // from kafka brokers in response to client requests.
  31. type Response = protocol.Message
  32. // RoundTripper is an interface implemented by types which support interacting
  33. // with kafka brokers.
  34. type RoundTripper interface {
  35. // RoundTrip sends a request to a kafka broker and returns the response that
  36. // was received, or a non-nil error.
  37. //
  38. // The context passed as first argument can be used to asynchronnously abort
  39. // the call if needed.
  40. RoundTrip(context.Context, net.Addr, Request) (Response, error)
  41. }
  42. // Transport is an implementation of the RoundTripper interface.
  43. //
  44. // Transport values manage a pool of connections and automatically discovers the
  45. // clusters layout to route requests to the appropriate brokers.
  46. //
  47. // Transport values are safe to use concurrently from multiple goroutines.
  48. //
  49. // Note: The intent is for the Transport to become the underlying layer of the
  50. // kafka.Reader and kafka.Writer types.
  51. type Transport struct {
  52. // A function used to establish connections to the kafka cluster.
  53. Dial func(context.Context, string, string) (net.Conn, error)
  54. // Time limit set for establishing connections to the kafka cluster. This
  55. // limit includes all round trips done to establish the connections (TLS
  56. // handshake, SASL negotiation, etc...).
  57. //
  58. // Defaults to 5s.
  59. DialTimeout time.Duration
  60. // Maximum amount of time that connections will remain open and unused.
  61. // The transport will manage to automatically close connections that have
  62. // been idle for too long, and re-open them on demand when the transport is
  63. // used again.
  64. //
  65. // Defaults to 30s.
  66. IdleTimeout time.Duration
  67. // TTL for the metadata cached by this transport. Note that the value
  68. // configured here is an upper bound, the transport randomizes the TTLs to
  69. // avoid getting into states where multiple clients end up synchronized and
  70. // cause bursts of requests to the kafka broker.
  71. //
  72. // Default to 6s.
  73. MetadataTTL time.Duration
  74. // Topic names for the metadata cached by this transport. If this field is left blank,
  75. // metadata information of all topics in the cluster will be retrieved.
  76. MetadataTopics []string
  77. // Unique identifier that the transport communicates to the brokers when it
  78. // sends requests.
  79. ClientID string
  80. // An optional configuration for TLS connections established by this
  81. // transport.
  82. //
  83. // If the Server
  84. TLS *tls.Config
  85. // SASL configures the Transfer to use SASL authentication.
  86. SASL sasl.Mechanism
  87. // An optional resolver used to translate broker host names into network
  88. // addresses.
  89. //
  90. // The resolver will be called for every request (not every connection),
  91. // making it possible to implement ACL policies by validating that the
  92. // program is allowed to connect to the kafka broker. This also means that
  93. // the resolver should probably provide a caching layer to avoid storming
  94. // the service discovery backend with requests.
  95. //
  96. // When set, the Dial function is not responsible for performing name
  97. // resolution, and is always called with a pre-resolved address.
  98. Resolver BrokerResolver
  99. // The background context used to control goroutines started internally by
  100. // the transport.
  101. //
  102. // If nil, context.Background() is used instead.
  103. Context context.Context
  104. mutex sync.RWMutex
  105. pools map[networkAddress]*connPool
  106. }
  107. // DefaultTransport is the default transport used by kafka clients in this
  108. // package.
  109. var DefaultTransport RoundTripper = &Transport{
  110. Dial: (&net.Dialer{
  111. Timeout: 3 * time.Second,
  112. DualStack: true,
  113. }).DialContext,
  114. }
  115. // CloseIdleConnections closes all idle connections immediately, and marks all
  116. // connections that are in use to be closed when they become idle again.
  117. func (t *Transport) CloseIdleConnections() {
  118. t.mutex.Lock()
  119. defer t.mutex.Unlock()
  120. for _, pool := range t.pools {
  121. pool.unref()
  122. }
  123. for k := range t.pools {
  124. delete(t.pools, k)
  125. }
  126. }
  127. // RoundTrip sends a request to a kafka cluster and returns the response, or an
  128. // error if no responses were received.
  129. //
  130. // Message types are available in sub-packages of the protocol package. Each
  131. // kafka API is implemented in a different sub-package. For example, the request
  132. // and response types for the Fetch API are available in the protocol/fetch
  133. // package.
  134. //
  135. // The type of the response message will match the type of the request. For
  136. // example, if RoundTrip was called with a *fetch.Request as argument, the value
  137. // returned will be of type *fetch.Response. It is safe for the program to do a
  138. // type assertion after checking that no error was returned.
  139. //
  140. // This example illustrates the way this method is expected to be used:
  141. //
  142. // r, err := transport.RoundTrip(ctx, addr, &fetch.Request{ ... })
  143. // if err != nil {
  144. // ...
  145. // } else {
  146. // res := r.(*fetch.Response)
  147. // ...
  148. // }
  149. //
  150. // The transport automatically selects the highest version of the API that is
  151. // supported by both the kafka-go package and the kafka broker. The negotiation
  152. // happens transparently once when connections are established.
  153. //
  154. // This API was introduced in version 0.4 as a way to leverage the lower-level
  155. // features of the kafka protocol, but also provide a more efficient way of
  156. // managing connections to kafka brokers.
  157. func (t *Transport) RoundTrip(ctx context.Context, addr net.Addr, req Request) (Response, error) {
  158. p := t.grabPool(addr)
  159. defer p.unref()
  160. return p.roundTrip(ctx, req)
  161. }
  162. func (t *Transport) dial() func(context.Context, string, string) (net.Conn, error) {
  163. if t.Dial != nil {
  164. return t.Dial
  165. }
  166. return defaultDialer.DialContext
  167. }
  168. func (t *Transport) dialTimeout() time.Duration {
  169. if t.DialTimeout > 0 {
  170. return t.DialTimeout
  171. }
  172. return 5 * time.Second
  173. }
  174. func (t *Transport) idleTimeout() time.Duration {
  175. if t.IdleTimeout > 0 {
  176. return t.IdleTimeout
  177. }
  178. return 30 * time.Second
  179. }
  180. func (t *Transport) metadataTTL() time.Duration {
  181. if t.MetadataTTL > 0 {
  182. return t.MetadataTTL
  183. }
  184. return 6 * time.Second
  185. }
  186. func (t *Transport) grabPool(addr net.Addr) *connPool {
  187. k := networkAddress{
  188. network: addr.Network(),
  189. address: addr.String(),
  190. }
  191. t.mutex.RLock()
  192. p := t.pools[k]
  193. if p != nil {
  194. p.ref()
  195. }
  196. t.mutex.RUnlock()
  197. if p != nil {
  198. return p
  199. }
  200. t.mutex.Lock()
  201. defer t.mutex.Unlock()
  202. if p := t.pools[k]; p != nil {
  203. p.ref()
  204. return p
  205. }
  206. ctx, cancel := context.WithCancel(t.context())
  207. p = &connPool{
  208. refc: 2,
  209. dial: t.dial(),
  210. dialTimeout: t.dialTimeout(),
  211. idleTimeout: t.idleTimeout(),
  212. metadataTTL: t.metadataTTL(),
  213. metadataTopics: t.MetadataTopics,
  214. clientID: t.ClientID,
  215. tls: t.TLS,
  216. sasl: t.SASL,
  217. resolver: t.Resolver,
  218. ready: make(event),
  219. wake: make(chan event),
  220. conns: make(map[int32]*connGroup),
  221. cancel: cancel,
  222. }
  223. p.ctrl = p.newConnGroup(addr)
  224. go p.discover(ctx, p.wake)
  225. if t.pools == nil {
  226. t.pools = make(map[networkAddress]*connPool)
  227. }
  228. t.pools[k] = p
  229. return p
  230. }
  231. func (t *Transport) context() context.Context {
  232. if t.Context != nil {
  233. return t.Context
  234. }
  235. return context.Background()
  236. }
  // event is a one-shot signal: waiters receive from the channel and are
  // released when it is closed.
  type event chan struct{}

  // trigger fires the event by closing the channel. It must be called at most
  // once per event, since closing an already closed channel panics.
  func (e event) trigger() {
  	close(e)
  }
  239. type connPool struct {
  240. refc uintptr
  241. // Immutable fields of the connection pool. Connections access these field
  242. // on their parent pool in a ready-only fashion, so no synchronization is
  243. // required.
  244. dial func(context.Context, string, string) (net.Conn, error)
  245. dialTimeout time.Duration
  246. idleTimeout time.Duration
  247. metadataTTL time.Duration
  248. metadataTopics []string
  249. clientID string
  250. tls *tls.Config
  251. sasl sasl.Mechanism
  252. resolver BrokerResolver
  253. // Signaling mechanisms to orchestrate communications between the pool and
  254. // the rest of the program.
  255. once sync.Once // ensure that `ready` is triggered only once
  256. ready event // triggered after the first metadata update
  257. wake chan event // used to force metadata updates
  258. cancel context.CancelFunc
  259. // Mutable fields of the connection pool, access must be synchronized.
  260. mutex sync.RWMutex
  261. conns map[int32]*connGroup // data connections used for produce/fetch/etc...
  262. ctrl *connGroup // control connections used for metadata requests
  263. state atomic.Value // cached cluster state
  264. }
  265. type connPoolState struct {
  266. metadata *meta.Response // last metadata response seen by the pool
  267. err error // last error from metadata requests
  268. layout protocol.Cluster // cluster layout built from metadata response
  269. }
  270. func (p *connPool) grabState() connPoolState {
  271. state, _ := p.state.Load().(connPoolState)
  272. return state
  273. }
  274. func (p *connPool) setState(state connPoolState) {
  275. p.state.Store(state)
  276. }
  277. func (p *connPool) ref() {
  278. atomic.AddUintptr(&p.refc, +1)
  279. }
  280. func (p *connPool) unref() {
  281. if atomic.AddUintptr(&p.refc, ^uintptr(0)) == 0 {
  282. p.mutex.Lock()
  283. defer p.mutex.Unlock()
  284. for _, conns := range p.conns {
  285. conns.closeIdleConns()
  286. }
  287. p.ctrl.closeIdleConns()
  288. p.cancel()
  289. }
  290. }
  291. func (p *connPool) roundTrip(ctx context.Context, req Request) (Response, error) {
  292. // This first select should never block after the first metadata response
  293. // that would mark the pool as `ready`.
  294. select {
  295. case <-p.ready:
  296. case <-ctx.Done():
  297. return nil, ctx.Err()
  298. }
  299. state := p.grabState()
  300. var response promise
  301. switch m := req.(type) {
  302. case *meta.Request:
  303. // We serve metadata requests directly from the transport cache unless
  304. // we would like to auto create a topic that isn't in our cache.
  305. //
  306. // This reduces the number of round trips to kafka brokers while keeping
  307. // the logic simple when applying partitioning strategies.
  308. if state.err != nil {
  309. return nil, state.err
  310. }
  311. cachedMeta := filterMetadataResponse(m, state.metadata)
  312. // requestNeeded indicates if we need to send this metadata request to the server.
  313. // It's true when we want to auto-create topics and we don't have the topic in our
  314. // cache.
  315. var requestNeeded bool
  316. if m.AllowAutoTopicCreation {
  317. for _, topic := range cachedMeta.Topics {
  318. if topic.ErrorCode == int16(UnknownTopicOrPartition) {
  319. requestNeeded = true
  320. break
  321. }
  322. }
  323. }
  324. if !requestNeeded {
  325. return cachedMeta, nil
  326. }
  327. case protocol.Splitter:
  328. // Messages that implement the Splitter interface trigger the creation of
  329. // multiple requests that are all merged back into a single results by
  330. // a merger.
  331. messages, merger, err := m.Split(state.layout)
  332. if err != nil {
  333. return nil, err
  334. }
  335. promises := make([]promise, len(messages))
  336. for i, m := range messages {
  337. promises[i] = p.sendRequest(ctx, m, state)
  338. }
  339. response = join(promises, messages, merger)
  340. }
  341. if response == nil {
  342. response = p.sendRequest(ctx, req, state)
  343. }
  344. r, err := response.await(ctx)
  345. if err != nil {
  346. return r, err
  347. }
  348. switch resp := r.(type) {
  349. case *createtopics.Response:
  350. // Force an update of the metadata when adding topics,
  351. // otherwise the cached state would get out of sync.
  352. topicsToRefresh := make([]string, 0, len(resp.Topics))
  353. for _, topic := range resp.Topics {
  354. // fixes issue 672: don't refresh topics that failed to create, it causes the library to hang indefinitely
  355. if topic.ErrorCode != 0 {
  356. continue
  357. }
  358. topicsToRefresh = append(topicsToRefresh, topic.Name)
  359. }
  360. p.refreshMetadata(ctx, topicsToRefresh)
  361. case *meta.Response:
  362. m := req.(*meta.Request)
  363. // If we get here with allow auto topic creation then
  364. // we didn't have that topic in our cache, so we should update
  365. // the cache.
  366. if m.AllowAutoTopicCreation {
  367. topicsToRefresh := make([]string, 0, len(resp.Topics))
  368. for _, topic := range resp.Topics {
  369. // Don't refresh topics that failed to create, since that may
  370. // mean that enable automatic topic creation is not enabled.
  371. // That causes the library to hang indefinitely, same as
  372. // don't refresh topics that failed to create,
  373. // createtopics process. Fixes issue 806.
  374. if topic.ErrorCode != 0 {
  375. continue
  376. }
  377. topicsToRefresh = append(topicsToRefresh, topic.Name)
  378. }
  379. p.refreshMetadata(ctx, topicsToRefresh)
  380. }
  381. }
  382. return r, nil
  383. }
  384. // refreshMetadata forces an update of the cached cluster metadata, and waits
  385. // for the given list of topics to appear. This waiting mechanism is necessary
  386. // to account for the fact that topic creation is asynchronous in kafka, and
  387. // causes subsequent requests to fail while the cluster state is propagated to
  388. // all the brokers.
  389. func (p *connPool) refreshMetadata(ctx context.Context, expectTopics []string) {
  390. minBackoff := 100 * time.Millisecond
  391. maxBackoff := 2 * time.Second
  392. cancel := ctx.Done()
  393. for ctx.Err() == nil {
  394. notify := make(event)
  395. select {
  396. case <-cancel:
  397. return
  398. case p.wake <- notify:
  399. select {
  400. case <-notify:
  401. case <-cancel:
  402. return
  403. }
  404. }
  405. state := p.grabState()
  406. found := 0
  407. for _, topic := range expectTopics {
  408. if _, ok := state.layout.Topics[topic]; ok {
  409. found++
  410. }
  411. }
  412. if found == len(expectTopics) {
  413. return
  414. }
  415. if delay := time.Duration(rand.Int63n(int64(minBackoff))); delay > 0 {
  416. timer := time.NewTimer(minBackoff)
  417. select {
  418. case <-cancel:
  419. case <-timer.C:
  420. }
  421. timer.Stop()
  422. if minBackoff *= 2; minBackoff > maxBackoff {
  423. minBackoff = maxBackoff
  424. }
  425. }
  426. }
  427. }
  428. func (p *connPool) setReady() {
  429. p.once.Do(p.ready.trigger)
  430. }
  431. // update is called periodically by the goroutine running the discover method
  432. // to refresh the cluster layout information used by the transport to route
  433. // requests to brokers.
  434. func (p *connPool) update(ctx context.Context, metadata *meta.Response, err error) {
  435. var layout protocol.Cluster
  436. if metadata != nil {
  437. metadata.ThrottleTimeMs = 0
  438. // Normalize the lists so we can apply binary search on them.
  439. sortMetadataBrokers(metadata.Brokers)
  440. sortMetadataTopics(metadata.Topics)
  441. for i := range metadata.Topics {
  442. t := &metadata.Topics[i]
  443. sortMetadataPartitions(t.Partitions)
  444. }
  445. layout = makeLayout(metadata)
  446. }
  447. state := p.grabState()
  448. addBrokers := make(map[int32]struct{})
  449. delBrokers := make(map[int32]struct{})
  450. if err != nil {
  451. // Only update the error on the transport if the cluster layout was
  452. // unknown. This ensures that we prioritize a previously known state
  453. // of the cluster to reduce the impact of transient failures.
  454. if state.metadata != nil {
  455. return
  456. }
  457. state.err = err
  458. } else {
  459. for id, b2 := range layout.Brokers {
  460. if b1, ok := state.layout.Brokers[id]; !ok {
  461. addBrokers[id] = struct{}{}
  462. } else if b1 != b2 {
  463. addBrokers[id] = struct{}{}
  464. delBrokers[id] = struct{}{}
  465. }
  466. }
  467. for id := range state.layout.Brokers {
  468. if _, ok := layout.Brokers[id]; !ok {
  469. delBrokers[id] = struct{}{}
  470. }
  471. }
  472. state.metadata, state.layout = metadata, layout
  473. state.err = nil
  474. }
  475. defer p.setReady()
  476. defer p.setState(state)
  477. if len(addBrokers) != 0 || len(delBrokers) != 0 {
  478. // Only acquire the lock when there is a change of layout. This is an
  479. // infrequent event so we don't risk introducing regular contention on
  480. // the mutex if we were to lock it on every update.
  481. p.mutex.Lock()
  482. defer p.mutex.Unlock()
  483. if ctx.Err() != nil {
  484. return // the pool has been closed, no need to update
  485. }
  486. for id := range delBrokers {
  487. if broker := p.conns[id]; broker != nil {
  488. broker.closeIdleConns()
  489. delete(p.conns, id)
  490. }
  491. }
  492. for id := range addBrokers {
  493. broker := layout.Brokers[id]
  494. p.conns[id] = p.newBrokerConnGroup(Broker{
  495. Rack: broker.Rack,
  496. Host: broker.Host,
  497. Port: int(broker.Port),
  498. ID: int(broker.ID),
  499. })
  500. }
  501. }
  502. }
  503. // discover is the entry point of an internal goroutine for the transport which
  504. // periodically requests updates of the cluster metadata and refreshes the
  505. // transport cached cluster layout.
  506. func (p *connPool) discover(ctx context.Context, wake <-chan event) {
  507. prng := rand.New(rand.NewSource(time.Now().UnixNano()))
  508. metadataTTL := func() time.Duration {
  509. return time.Duration(prng.Int63n(int64(p.metadataTTL)))
  510. }
  511. timer := time.NewTimer(metadataTTL())
  512. defer timer.Stop()
  513. var notify event
  514. done := ctx.Done()
  515. req := &meta.Request{
  516. TopicNames: p.metadataTopics,
  517. }
  518. for {
  519. c, err := p.grabClusterConn(ctx)
  520. if err != nil {
  521. p.update(ctx, nil, err)
  522. } else {
  523. res := make(async, 1)
  524. deadline, cancel := context.WithTimeout(ctx, p.metadataTTL)
  525. c.reqs <- connRequest{
  526. ctx: deadline,
  527. req: req,
  528. res: res,
  529. }
  530. r, err := res.await(deadline)
  531. cancel()
  532. if err != nil && errors.Is(err, ctx.Err()) {
  533. return
  534. }
  535. ret, _ := r.(*meta.Response)
  536. p.update(ctx, ret, err)
  537. }
  538. if notify != nil {
  539. notify.trigger()
  540. notify = nil
  541. }
  542. select {
  543. case <-timer.C:
  544. timer.Reset(metadataTTL())
  545. case <-done:
  546. return
  547. case notify = <-wake:
  548. }
  549. }
  550. }
  551. // grabBrokerConn returns a connection to a specific broker represented by the
  552. // broker id passed as argument. If the broker id was not known, an error is
  553. // returned.
  554. func (p *connPool) grabBrokerConn(ctx context.Context, brokerID int32) (*conn, error) {
  555. p.mutex.RLock()
  556. g := p.conns[brokerID]
  557. p.mutex.RUnlock()
  558. if g == nil {
  559. return nil, BrokerNotAvailable
  560. }
  561. return g.grabConnOrConnect(ctx)
  562. }
  563. // grabClusterConn returns the connection to the kafka cluster that the pool is
  564. // configured to connect to.
  565. //
  566. // The transport uses a shared `control` connection to the cluster for any
  567. // requests that aren't supposed to be sent to specific brokers (e.g. Fetch or
  568. // Produce requests). Requests intended to be routed to specific brokers are
  569. // dispatched on a separate pool of connections that the transport maintains.
  570. // This split help avoid head-of-line blocking situations where control requests
  571. // like Metadata would be queued behind large responses from Fetch requests for
  572. // example.
  573. //
  574. // In either cases, the requests are multiplexed so we can keep a minimal number
  575. // of connections open (N+1, where N is the number of brokers in the cluster).
  576. func (p *connPool) grabClusterConn(ctx context.Context) (*conn, error) {
  577. return p.ctrl.grabConnOrConnect(ctx)
  578. }
  579. func (p *connPool) sendRequest(ctx context.Context, req Request, state connPoolState) promise {
  580. brokerID := int32(-1)
  581. switch m := req.(type) {
  582. case protocol.BrokerMessage:
  583. // Some requests are supposed to be sent to specific brokers (e.g. the
  584. // partition leaders). They implement the BrokerMessage interface to
  585. // delegate the routing decision to each message type.
  586. broker, err := m.Broker(state.layout)
  587. if err != nil {
  588. return reject(err)
  589. }
  590. brokerID = broker.ID
  591. case protocol.GroupMessage:
  592. // Some requests are supposed to be sent to a group coordinator,
  593. // look up which broker is currently the coordinator for the group
  594. // so we can get a connection to that broker.
  595. //
  596. // TODO: should we cache the coordinator info?
  597. p := p.sendRequest(ctx, &findcoordinator.Request{Key: m.Group()}, state)
  598. r, err := p.await(ctx)
  599. if err != nil {
  600. return reject(err)
  601. }
  602. brokerID = r.(*findcoordinator.Response).NodeID
  603. case protocol.TransactionalMessage:
  604. p := p.sendRequest(ctx, &findcoordinator.Request{
  605. Key: m.Transaction(),
  606. KeyType: int8(CoordinatorKeyTypeTransaction),
  607. }, state)
  608. r, err := p.await(ctx)
  609. if err != nil {
  610. return reject(err)
  611. }
  612. brokerID = r.(*findcoordinator.Response).NodeID
  613. }
  614. var c *conn
  615. var err error
  616. if brokerID >= 0 {
  617. c, err = p.grabBrokerConn(ctx, brokerID)
  618. } else {
  619. c, err = p.grabClusterConn(ctx)
  620. }
  621. if err != nil {
  622. return reject(err)
  623. }
  624. res := make(async, 1)
  625. c.reqs <- connRequest{
  626. ctx: ctx,
  627. req: req,
  628. res: res,
  629. }
  630. return res
  631. }
  632. func filterMetadataResponse(req *meta.Request, res *meta.Response) *meta.Response {
  633. ret := *res
  634. if req.TopicNames != nil {
  635. ret.Topics = make([]meta.ResponseTopic, len(req.TopicNames))
  636. for i, topicName := range req.TopicNames {
  637. j, ok := findMetadataTopic(res.Topics, topicName)
  638. if ok {
  639. ret.Topics[i] = res.Topics[j]
  640. } else {
  641. ret.Topics[i] = meta.ResponseTopic{
  642. ErrorCode: int16(UnknownTopicOrPartition),
  643. Name: topicName,
  644. }
  645. }
  646. }
  647. }
  648. return &ret
  649. }
  650. func findMetadataTopic(topics []meta.ResponseTopic, topicName string) (int, bool) {
  651. i := sort.Search(len(topics), func(i int) bool {
  652. return topics[i].Name >= topicName
  653. })
  654. return i, i >= 0 && i < len(topics) && topics[i].Name == topicName
  655. }
  656. func sortMetadataBrokers(brokers []meta.ResponseBroker) {
  657. sort.Slice(brokers, func(i, j int) bool {
  658. return brokers[i].NodeID < brokers[j].NodeID
  659. })
  660. }
  661. func sortMetadataTopics(topics []meta.ResponseTopic) {
  662. sort.Slice(topics, func(i, j int) bool {
  663. return topics[i].Name < topics[j].Name
  664. })
  665. }
  666. func sortMetadataPartitions(partitions []meta.ResponsePartition) {
  667. sort.Slice(partitions, func(i, j int) bool {
  668. return partitions[i].PartitionIndex < partitions[j].PartitionIndex
  669. })
  670. }
  671. func makeLayout(metadataResponse *meta.Response) protocol.Cluster {
  672. layout := protocol.Cluster{
  673. Controller: metadataResponse.ControllerID,
  674. Brokers: make(map[int32]protocol.Broker),
  675. Topics: make(map[string]protocol.Topic),
  676. }
  677. for _, broker := range metadataResponse.Brokers {
  678. layout.Brokers[broker.NodeID] = protocol.Broker{
  679. Rack: broker.Rack,
  680. Host: broker.Host,
  681. Port: broker.Port,
  682. ID: broker.NodeID,
  683. }
  684. }
  685. for _, topic := range metadataResponse.Topics {
  686. if topic.IsInternal {
  687. continue // TODO: do we need to expose those?
  688. }
  689. layout.Topics[topic.Name] = protocol.Topic{
  690. Name: topic.Name,
  691. Error: topic.ErrorCode,
  692. Partitions: makePartitions(topic.Partitions),
  693. }
  694. }
  695. return layout
  696. }
  697. func makePartitions(metadataPartitions []meta.ResponsePartition) map[int32]protocol.Partition {
  698. protocolPartitions := make(map[int32]protocol.Partition, len(metadataPartitions))
  699. numBrokerIDs := 0
  700. for _, p := range metadataPartitions {
  701. numBrokerIDs += len(p.ReplicaNodes) + len(p.IsrNodes) + len(p.OfflineReplicas)
  702. }
  703. // Reduce the memory footprint a bit by allocating a single buffer to write
  704. // all broker ids.
  705. brokerIDs := make([]int32, 0, numBrokerIDs)
  706. for _, p := range metadataPartitions {
  707. var rep, isr, off []int32
  708. brokerIDs, rep = appendBrokerIDs(brokerIDs, p.ReplicaNodes)
  709. brokerIDs, isr = appendBrokerIDs(brokerIDs, p.IsrNodes)
  710. brokerIDs, off = appendBrokerIDs(brokerIDs, p.OfflineReplicas)
  711. protocolPartitions[p.PartitionIndex] = protocol.Partition{
  712. ID: p.PartitionIndex,
  713. Error: p.ErrorCode,
  714. Leader: p.LeaderID,
  715. Replicas: rep,
  716. ISR: isr,
  717. Offline: off,
  718. }
  719. }
  720. return protocolPartitions
  721. }
  // appendBrokerIDs appends brokers to ids and returns the grown slice along
  // with a view of just the appended section. The capacity of the view is
  // capped to its length, so appending to it can never overwrite the ids that
  // follow it in the shared buffer.
  func appendBrokerIDs(ids, brokers []int32) ([]int32, []int32) {
  	start := len(ids)
  	ids = append(ids, brokers...)
  	end := len(ids)
  	return ids, ids[start:end:end]
  }
  727. func (p *connPool) newConnGroup(a net.Addr) *connGroup {
  728. return &connGroup{
  729. addr: a,
  730. pool: p,
  731. broker: Broker{
  732. ID: -1,
  733. },
  734. }
  735. }
  736. func (p *connPool) newBrokerConnGroup(broker Broker) *connGroup {
  737. return &connGroup{
  738. addr: &networkAddress{
  739. network: "tcp",
  740. address: net.JoinHostPort(broker.Host, strconv.Itoa(broker.Port)),
  741. },
  742. pool: p,
  743. broker: broker,
  744. }
  745. }
  746. type connRequest struct {
  747. ctx context.Context
  748. req Request
  749. res async
  750. }
// The promise interface is used as a message passing abstraction to coordinate
// between goroutines that handle requests and responses.
//
// The implementations in this file are async, rejected and joined.
type promise interface {
	// Waits until the promise is resolved, rejected, or the context canceled.
	await(context.Context) (Response, error)
}
// async is an implementation of the promise interface which supports resolving
// or rejecting the await call asynchronously.
//
// The channel transports the outcome: a Response (possibly nil) sent by
// resolve, or an error sent by reject. await panics on any other value.
type async chan interface{}
  760. func (p async) await(ctx context.Context) (Response, error) {
  761. select {
  762. case x := <-p:
  763. switch v := x.(type) {
  764. case nil:
  765. return nil, nil // A nil response is ok (e.g. when RequiredAcks is None)
  766. case Response:
  767. return v, nil
  768. case error:
  769. return nil, v
  770. default:
  771. panic(fmt.Errorf("BUG: promise resolved with impossible value of type %T", v))
  772. }
  773. case <-ctx.Done():
  774. return nil, ctx.Err()
  775. }
  776. }
  777. func (p async) resolve(res Response) { p <- res }
  778. func (p async) reject(err error) { p <- err }
// rejected is an implementation of the promise interface which always
// returns an error. Values of this type are constructed using the reject
// function.
type rejected struct{ err error }
  783. func reject(err error) promise { return &rejected{err: err} }
  784. func (p *rejected) await(ctx context.Context) (Response, error) {
  785. return nil, p.err
  786. }
// joined is an implementation of the promise interface which merges results
// from multiple promises into one await call using a merger.
type joined struct {
	// promises are awaited in slice order; the outcome of promises[i] becomes
	// the i-th result passed to the merger.
	promises []promise
	// requests is passed to the merger alongside the collected results.
	requests []Request
	// merger combines the collected results into a single response.
	merger protocol.Merger
}
  794. func join(promises []promise, requests []Request, merger protocol.Merger) promise {
  795. return &joined{
  796. promises: promises,
  797. requests: requests,
  798. merger: merger,
  799. }
  800. }
  801. func (p *joined) await(ctx context.Context) (Response, error) {
  802. results := make([]interface{}, len(p.promises))
  803. for i, sub := range p.promises {
  804. m, err := sub.await(ctx)
  805. if err != nil {
  806. results[i] = err
  807. } else {
  808. results[i] = m
  809. }
  810. }
  811. return p.merger.Merge(p.requests, results)
  812. }
  813. // Default dialer used by the transport connections when no Dial function
  814. // was configured by the program.
  815. var defaultDialer = net.Dialer{
  816. Timeout: 3 * time.Second,
  817. DualStack: true,
  818. }
// connGroup represents a logical connection group to a kafka broker. The
// actual network connections are lazily opened before sending requests, and
// closed if they are unused for longer than the idle timeout.
type connGroup struct {
	// addr is the address new connections are dialed to; broker identifies
	// the broker, with a negative ID when the group was created from an
	// address only.
	addr   net.Addr
	broker Broker
	// Immutable state of the connection.
	pool *connPool
	// Shared state of the connection group, guarded by mutex.
	//
	// NOTE(review): the original comment referred to "calls to the
	// synchronized method"; no such method is visible in this part of the
	// file, where the methods lock mutex directly.
	mutex     sync.Mutex
	closed    bool    // set by closeIdleConns; releaseConn then refuses connections
	idleConns []*conn // stack of idle connections
}
  834. func (g *connGroup) closeIdleConns() {
  835. g.mutex.Lock()
  836. conns := g.idleConns
  837. g.idleConns = nil
  838. g.closed = true
  839. g.mutex.Unlock()
  840. for _, c := range conns {
  841. c.close()
  842. }
  843. }
  844. func (g *connGroup) grabConnOrConnect(ctx context.Context) (*conn, error) {
  845. rslv := g.pool.resolver
  846. addr := g.addr
  847. var c *conn
  848. if rslv == nil {
  849. c = g.grabConn()
  850. } else {
  851. var err error
  852. broker := g.broker
  853. if broker.ID < 0 {
  854. host, port, err := splitHostPortNumber(addr.String())
  855. if err != nil {
  856. return nil, err
  857. }
  858. broker.Host = host
  859. broker.Port = port
  860. }
  861. ipAddrs, err := rslv.LookupBrokerIPAddr(ctx, broker)
  862. if err != nil {
  863. return nil, err
  864. }
  865. for _, ipAddr := range ipAddrs {
  866. network := addr.Network()
  867. address := net.JoinHostPort(ipAddr.String(), strconv.Itoa(broker.Port))
  868. if c = g.grabConnTo(network, address); c != nil {
  869. break
  870. }
  871. }
  872. }
  873. if c == nil {
  874. connChan := make(chan *conn)
  875. errChan := make(chan error)
  876. go func() {
  877. c, err := g.connect(ctx, addr)
  878. if err != nil {
  879. select {
  880. case errChan <- err:
  881. case <-ctx.Done():
  882. }
  883. } else {
  884. select {
  885. case connChan <- c:
  886. case <-ctx.Done():
  887. if !g.releaseConn(c) {
  888. c.close()
  889. }
  890. }
  891. }
  892. }()
  893. select {
  894. case c = <-connChan:
  895. case err := <-errChan:
  896. return nil, err
  897. case <-ctx.Done():
  898. return nil, ctx.Err()
  899. }
  900. }
  901. return c, nil
  902. }
  903. func (g *connGroup) grabConnTo(network, address string) *conn {
  904. g.mutex.Lock()
  905. defer g.mutex.Unlock()
  906. for i := len(g.idleConns) - 1; i >= 0; i-- {
  907. c := g.idleConns[i]
  908. if c.network == network && c.address == address {
  909. copy(g.idleConns[i:], g.idleConns[i+1:])
  910. n := len(g.idleConns) - 1
  911. g.idleConns[n] = nil
  912. g.idleConns = g.idleConns[:n]
  913. if c.timer != nil {
  914. c.timer.Stop()
  915. }
  916. return c
  917. }
  918. }
  919. return nil
  920. }
  921. func (g *connGroup) grabConn() *conn {
  922. g.mutex.Lock()
  923. defer g.mutex.Unlock()
  924. if len(g.idleConns) == 0 {
  925. return nil
  926. }
  927. n := len(g.idleConns) - 1
  928. c := g.idleConns[n]
  929. g.idleConns[n] = nil
  930. g.idleConns = g.idleConns[:n]
  931. if c.timer != nil {
  932. c.timer.Stop()
  933. }
  934. return c
  935. }
  936. func (g *connGroup) removeConn(c *conn) bool {
  937. g.mutex.Lock()
  938. defer g.mutex.Unlock()
  939. if c.timer != nil {
  940. c.timer.Stop()
  941. }
  942. for i, x := range g.idleConns {
  943. if x == c {
  944. copy(g.idleConns[i:], g.idleConns[i+1:])
  945. n := len(g.idleConns) - 1
  946. g.idleConns[n] = nil
  947. g.idleConns = g.idleConns[:n]
  948. return true
  949. }
  950. }
  951. return false
  952. }
  953. func (g *connGroup) releaseConn(c *conn) bool {
  954. idleTimeout := g.pool.idleTimeout
  955. g.mutex.Lock()
  956. defer g.mutex.Unlock()
  957. if g.closed {
  958. return false
  959. }
  960. if c.timer != nil {
  961. c.timer.Reset(idleTimeout)
  962. } else {
  963. c.timer = time.AfterFunc(idleTimeout, func() {
  964. if g.removeConn(c) {
  965. c.close()
  966. }
  967. })
  968. }
  969. g.idleConns = append(g.idleConns, c)
  970. return true
  971. }
  972. func (g *connGroup) connect(ctx context.Context, addr net.Addr) (*conn, error) {
  973. deadline := time.Now().Add(g.pool.dialTimeout)
  974. ctx, cancel := context.WithDeadline(ctx, deadline)
  975. defer cancel()
  976. network := strings.Split(addr.Network(), ",")
  977. address := strings.Split(addr.String(), ",")
  978. var netConn net.Conn
  979. var netAddr net.Addr
  980. var err error
  981. if len(address) > 1 {
  982. // Shuffle the list of addresses to randomize the order in which
  983. // connections are attempted. This prevents routing all connections
  984. // to the first broker (which will usually succeed).
  985. rand.Shuffle(len(address), func(i, j int) {
  986. network[i], network[j] = network[j], network[i]
  987. address[i], address[j] = address[j], address[i]
  988. })
  989. }
  990. for i := range address {
  991. netConn, err = g.pool.dial(ctx, network[i], address[i])
  992. if err == nil {
  993. netAddr = &networkAddress{
  994. network: network[i],
  995. address: address[i],
  996. }
  997. break
  998. }
  999. }
  1000. if err != nil {
  1001. return nil, err
  1002. }
  1003. defer func() {
  1004. if netConn != nil {
  1005. netConn.Close()
  1006. }
  1007. }()
  1008. if tlsConfig := g.pool.tls; tlsConfig != nil {
  1009. if tlsConfig.ServerName == "" {
  1010. host, _ := splitHostPort(netAddr.String())
  1011. tlsConfig = tlsConfig.Clone()
  1012. tlsConfig.ServerName = host
  1013. }
  1014. netConn = tls.Client(netConn, tlsConfig)
  1015. }
  1016. pc := protocol.NewConn(netConn, g.pool.clientID)
  1017. pc.SetDeadline(deadline)
  1018. r, err := pc.RoundTrip(new(apiversions.Request))
  1019. if err != nil {
  1020. return nil, err
  1021. }
  1022. res := r.(*apiversions.Response)
  1023. ver := make(map[protocol.ApiKey]int16, len(res.ApiKeys))
  1024. if res.ErrorCode != 0 {
  1025. return nil, fmt.Errorf("negotating API versions with kafka broker at %s: %w", g.addr, Error(res.ErrorCode))
  1026. }
  1027. for _, r := range res.ApiKeys {
  1028. apiKey := protocol.ApiKey(r.ApiKey)
  1029. ver[apiKey] = apiKey.SelectVersion(r.MinVersion, r.MaxVersion)
  1030. }
  1031. pc.SetVersions(ver)
  1032. pc.SetDeadline(time.Time{})
  1033. if g.pool.sasl != nil {
  1034. host, port, err := splitHostPortNumber(netAddr.String())
  1035. if err != nil {
  1036. return nil, err
  1037. }
  1038. metadata := &sasl.Metadata{
  1039. Host: host,
  1040. Port: port,
  1041. }
  1042. if err := authenticateSASL(sasl.WithMetadata(ctx, metadata), pc, g.pool.sasl); err != nil {
  1043. return nil, err
  1044. }
  1045. }
  1046. reqs := make(chan connRequest)
  1047. c := &conn{
  1048. network: netAddr.Network(),
  1049. address: netAddr.String(),
  1050. reqs: reqs,
  1051. group: g,
  1052. }
  1053. go c.run(pc, reqs)
  1054. netConn = nil
  1055. return c, nil
  1056. }
// conn is the handle through which requests are submitted to the goroutine
// that serves a single broker connection (see conn.run).
type conn struct {
	// reqs is the sending side of the request channel consumed by run;
	// closing it (see close) ends the run loop.
	reqs chan<- connRequest
	// network and address identify the endpoint the connection was dialed
	// to; grabConnTo matches idle connections against them.
	network string
	address string
	// once guarantees that reqs is closed at most once.
	once sync.Once
	// group is the connection group the connection is released to after
	// each request.
	group *connGroup
	// timer is the idle timer, created by the group on the first release.
	// The group's methods access it while holding the group's mutex.
	timer *time.Timer
}
  1065. func (c *conn) close() {
  1066. c.once.Do(func() { close(c.reqs) })
  1067. }
  1068. func (c *conn) run(pc *protocol.Conn, reqs <-chan connRequest) {
  1069. defer pc.Close()
  1070. for cr := range reqs {
  1071. r, err := c.roundTrip(cr.ctx, pc, cr.req)
  1072. if err != nil {
  1073. cr.res.reject(err)
  1074. if !errors.Is(err, protocol.ErrNoRecord) {
  1075. break
  1076. }
  1077. } else {
  1078. cr.res.resolve(r)
  1079. }
  1080. if !c.group.releaseConn(c) {
  1081. break
  1082. }
  1083. }
  1084. }
  1085. func (c *conn) roundTrip(ctx context.Context, pc *protocol.Conn, req Request) (Response, error) {
  1086. pprof.SetGoroutineLabels(ctx)
  1087. defer pprof.SetGoroutineLabels(context.Background())
  1088. if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
  1089. pc.SetDeadline(deadline)
  1090. defer pc.SetDeadline(time.Time{})
  1091. }
  1092. return pc.RoundTrip(req)
  1093. }
  1094. // authenticateSASL performs all of the required requests to authenticate this
  1095. // connection. If any step fails, this function returns with an error. A nil
  1096. // error indicates successful authentication.
  1097. func authenticateSASL(ctx context.Context, pc *protocol.Conn, mechanism sasl.Mechanism) error {
  1098. if err := saslHandshakeRoundTrip(pc, mechanism.Name()); err != nil {
  1099. return err
  1100. }
  1101. sess, state, err := mechanism.Start(ctx)
  1102. if err != nil {
  1103. return err
  1104. }
  1105. for completed := false; !completed; {
  1106. challenge, err := saslAuthenticateRoundTrip(pc, state)
  1107. if err != nil {
  1108. if errors.Is(err, io.EOF) {
  1109. // the broker may communicate a failed exchange by closing the
  1110. // connection (esp. in the case where we're passing opaque sasl
  1111. // data over the wire since there's no protocol info).
  1112. return SASLAuthenticationFailed
  1113. }
  1114. return err
  1115. }
  1116. completed, state, err = sess.Next(ctx, challenge)
  1117. if err != nil {
  1118. return err
  1119. }
  1120. }
  1121. return nil
  1122. }
  1123. // saslHandshake sends the SASL handshake message. This will determine whether
  1124. // the Mechanism is supported by the cluster. If it's not, this function will
  1125. // error out with UnsupportedSASLMechanism.
  1126. //
  1127. // If the mechanism is unsupported, the handshake request will reply with the
  1128. // list of the cluster's configured mechanisms, which could potentially be used
  1129. // to facilitate negotiation. At the moment, we are not negotiating the
  1130. // mechanism as we believe that brokers are usually known to the client, and
  1131. // therefore the client should already know which mechanisms are supported.
  1132. //
  1133. // See http://kafka.apache.org/protocol.html#The_Messages_SaslHandshake
  1134. func saslHandshakeRoundTrip(pc *protocol.Conn, mechanism string) error {
  1135. msg, err := pc.RoundTrip(&saslhandshake.Request{
  1136. Mechanism: mechanism,
  1137. })
  1138. if err != nil {
  1139. return err
  1140. }
  1141. res := msg.(*saslhandshake.Response)
  1142. if res.ErrorCode != 0 {
  1143. err = Error(res.ErrorCode)
  1144. }
  1145. return err
  1146. }
  1147. // saslAuthenticate sends the SASL authenticate message. This function must
  1148. // be immediately preceded by a successful saslHandshake.
  1149. //
  1150. // See http://kafka.apache.org/protocol.html#The_Messages_SaslAuthenticate
  1151. func saslAuthenticateRoundTrip(pc *protocol.Conn, data []byte) ([]byte, error) {
  1152. msg, err := pc.RoundTrip(&saslauthenticate.Request{
  1153. AuthBytes: data,
  1154. })
  1155. if err != nil {
  1156. return nil, err
  1157. }
  1158. res := msg.(*saslauthenticate.Response)
  1159. if res.ErrorCode != 0 {
  1160. err = makeError(res.ErrorCode, res.ErrorMessage)
  1161. }
  1162. return res.AuthBytes, err
  1163. }
// Compile-time check that *Transport satisfies the RoundTripper interface.
var _ RoundTripper = (*Transport)(nil)