transport.go 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304
  1. package kafka
  2. import (
  3. "context"
  4. "crypto/tls"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "math/rand"
  9. "net"
  10. "runtime/pprof"
  11. "sort"
  12. "strconv"
  13. "strings"
  14. "sync"
  15. "sync/atomic"
  16. "time"
  17. "github.com/segmentio/kafka-go/protocol"
  18. "github.com/segmentio/kafka-go/protocol/apiversions"
  19. "github.com/segmentio/kafka-go/protocol/createtopics"
  20. "github.com/segmentio/kafka-go/protocol/findcoordinator"
  21. meta "github.com/segmentio/kafka-go/protocol/metadata"
  22. "github.com/segmentio/kafka-go/protocol/saslauthenticate"
  23. "github.com/segmentio/kafka-go/protocol/saslhandshake"
  24. "github.com/segmentio/kafka-go/sasl"
  25. )
// Request is an interface implemented by types that represent messages sent
// from kafka clients to brokers.
//
// It is declared as an alias of protocol.Message, so both names denote the
// same type.
type Request = protocol.Message

// Response is an interface implemented by types that represent messages sent
// from kafka brokers in response to client requests.
//
// It is declared as an alias of protocol.Message, so both names denote the
// same type.
type Response = protocol.Message
// RoundTripper is an interface implemented by types which support interacting
// with kafka brokers.
type RoundTripper interface {
	// RoundTrip sends a request to a kafka broker and returns the response that
	// was received, or a non-nil error.
	//
	// The context passed as first argument can be used to asynchronously abort
	// the call if needed.
	RoundTrip(context.Context, net.Addr, Request) (Response, error)
}
// Transport is an implementation of the RoundTripper interface.
//
// Transport values manage a pool of connections and automatically discover the
// cluster's layout to route requests to the appropriate brokers.
//
// Transport values are safe to use concurrently from multiple goroutines.
//
// Note: The intent is for the Transport to become the underlying layer of the
// kafka.Reader and kafka.Writer types.
type Transport struct {
	// A function used to establish connections to the kafka cluster.
	//
	// When nil, the package's default dialer is used.
	Dial func(context.Context, string, string) (net.Conn, error)

	// Time limit set for establishing connections to the kafka cluster. This
	// limit includes all round trips done to establish the connections (TLS
	// handshake, SASL negotiation, etc...).
	//
	// Defaults to 5s.
	DialTimeout time.Duration

	// Maximum amount of time that connections will remain open and unused.
	// The transport will manage to automatically close connections that have
	// been idle for too long, and re-open them on demand when the transport is
	// used again.
	//
	// Defaults to 30s.
	IdleTimeout time.Duration

	// TTL for the metadata cached by this transport. Note that the value
	// configured here is an upper bound, the transport randomizes the TTLs to
	// avoid getting into states where multiple clients end up synchronized and
	// cause bursts of requests to the kafka broker.
	//
	// Defaults to 6s.
	MetadataTTL time.Duration

	// Unique identifier that the transport communicates to the brokers when it
	// sends requests.
	ClientID string

	// An optional configuration for TLS connections established by this
	// transport.
	//
	// NOTE(review): the original comment ended with an unfinished sentence
	// ("If the Server"); presumably it described how the TLS server name is
	// chosen -- confirm against the connection setup code and complete it.
	TLS *tls.Config

	// SASL configures the Transport to use SASL authentication.
	SASL sasl.Mechanism

	// An optional resolver used to translate broker host names into network
	// addresses.
	//
	// The resolver will be called for every request (not every connection),
	// making it possible to implement ACL policies by validating that the
	// program is allowed to connect to the kafka broker. This also means that
	// the resolver should probably provide a caching layer to avoid storming
	// the service discovery backend with requests.
	//
	// When set, the Dial function is not responsible for performing name
	// resolution, and is always called with a pre-resolved address.
	Resolver BrokerResolver

	// The background context used to control goroutines started internally by
	// the transport.
	//
	// If nil, context.Background() is used instead.
	Context context.Context

	// mutex guards pools.
	mutex sync.RWMutex
	// pools holds one connection pool per (network, address) pair passed to
	// RoundTrip; the map is allocated lazily when the first pool is created.
	pools map[networkAddress]*connPool
}
  104. // DefaultTransport is the default transport used by kafka clients in this
  105. // package.
  106. var DefaultTransport RoundTripper = &Transport{
  107. Dial: (&net.Dialer{
  108. Timeout: 3 * time.Second,
  109. DualStack: true,
  110. }).DialContext,
  111. }
  112. // CloseIdleConnections closes all idle connections immediately, and marks all
  113. // connections that are in use to be closed when they become idle again.
  114. func (t *Transport) CloseIdleConnections() {
  115. t.mutex.Lock()
  116. defer t.mutex.Unlock()
  117. for _, pool := range t.pools {
  118. pool.unref()
  119. }
  120. for k := range t.pools {
  121. delete(t.pools, k)
  122. }
  123. }
  124. // RoundTrip sends a request to a kafka cluster and returns the response, or an
  125. // error if no responses were received.
  126. //
  127. // Message types are available in sub-packages of the protocol package. Each
  128. // kafka API is implemented in a different sub-package. For example, the request
  129. // and response types for the Fetch API are available in the protocol/fetch
  130. // package.
  131. //
  132. // The type of the response message will match the type of the request. For
  133. // exmple, if RoundTrip was called with a *fetch.Request as argument, the value
  134. // returned will be of type *fetch.Response. It is safe for the program to do a
  135. // type assertion after checking that no error was returned.
  136. //
  137. // This example illustrates the way this method is expected to be used:
  138. //
  139. // r, err := transport.RoundTrip(ctx, addr, &fetch.Request{ ... })
  140. // if err != nil {
  141. // ...
  142. // } else {
  143. // res := r.(*fetch.Response)
  144. // ...
  145. // }
  146. //
  147. // The transport automatically selects the highest version of the API that is
  148. // supported by both the kafka-go package and the kafka broker. The negotiation
  149. // happens transparently once when connections are established.
  150. //
  151. // This API was introduced in version 0.4 as a way to leverage the lower-level
  152. // features of the kafka protocol, but also provide a more efficient way of
  153. // managing connections to kafka brokers.
  154. func (t *Transport) RoundTrip(ctx context.Context, addr net.Addr, req Request) (Response, error) {
  155. p := t.grabPool(addr)
  156. defer p.unref()
  157. return p.roundTrip(ctx, req)
  158. }
  159. func (t *Transport) dial() func(context.Context, string, string) (net.Conn, error) {
  160. if t.Dial != nil {
  161. return t.Dial
  162. }
  163. return defaultDialer.DialContext
  164. }
  165. func (t *Transport) dialTimeout() time.Duration {
  166. if t.DialTimeout > 0 {
  167. return t.DialTimeout
  168. }
  169. return 5 * time.Second
  170. }
  171. func (t *Transport) idleTimeout() time.Duration {
  172. if t.IdleTimeout > 0 {
  173. return t.IdleTimeout
  174. }
  175. return 30 * time.Second
  176. }
  177. func (t *Transport) metadataTTL() time.Duration {
  178. if t.MetadataTTL > 0 {
  179. return t.MetadataTTL
  180. }
  181. return 6 * time.Second
  182. }
  183. func (t *Transport) grabPool(addr net.Addr) *connPool {
  184. k := networkAddress{
  185. network: addr.Network(),
  186. address: addr.String(),
  187. }
  188. t.mutex.RLock()
  189. p := t.pools[k]
  190. if p != nil {
  191. p.ref()
  192. }
  193. t.mutex.RUnlock()
  194. if p != nil {
  195. return p
  196. }
  197. t.mutex.Lock()
  198. defer t.mutex.Unlock()
  199. if p := t.pools[k]; p != nil {
  200. p.ref()
  201. return p
  202. }
  203. ctx, cancel := context.WithCancel(t.context())
  204. p = &connPool{
  205. refc: 2,
  206. dial: t.dial(),
  207. dialTimeout: t.dialTimeout(),
  208. idleTimeout: t.idleTimeout(),
  209. metadataTTL: t.metadataTTL(),
  210. clientID: t.ClientID,
  211. tls: t.TLS,
  212. sasl: t.SASL,
  213. resolver: t.Resolver,
  214. ready: make(event),
  215. wake: make(chan event),
  216. conns: make(map[int32]*connGroup),
  217. cancel: cancel,
  218. }
  219. p.ctrl = p.newConnGroup(addr)
  220. go p.discover(ctx, p.wake)
  221. if t.pools == nil {
  222. t.pools = make(map[networkAddress]*connPool)
  223. }
  224. t.pools[k] = p
  225. return p
  226. }
  227. func (t *Transport) context() context.Context {
  228. if t.Context != nil {
  229. return t.Context
  230. }
  231. return context.Background()
  232. }
  233. type event chan struct{}
  234. func (e event) trigger() { close(e) }
// connPool holds the connections and the cached cluster state that a
// Transport maintains for one cluster address.
type connPool struct {
	// refc is the pool's reference count, updated with atomic operations by
	// ref and unref.
	refc uintptr

	// Immutable fields of the connection pool. Connections access these fields
	// on their parent pool in a read-only fashion, so no synchronization is
	// required.
	dial        func(context.Context, string, string) (net.Conn, error)
	dialTimeout time.Duration
	idleTimeout time.Duration
	metadataTTL time.Duration
	clientID    string
	tls         *tls.Config
	sasl        sasl.Mechanism
	resolver    BrokerResolver

	// Signaling mechanisms to orchestrate communications between the pool and
	// the rest of the program.
	once   sync.Once  // ensure that `ready` is triggered only once
	ready  event      // triggered after the first metadata update
	wake   chan event // used to force metadata updates
	cancel context.CancelFunc

	// Mutable fields of the connection pool, access must be synchronized.
	mutex sync.RWMutex
	conns map[int32]*connGroup // data connections used for produce/fetch/etc...
	ctrl  *connGroup           // control connections used for metadata requests
	state atomic.Value         // cached cluster state
}
// connPoolState is the snapshot of cluster information that a connPool
// stores in its atomic state value.
type connPoolState struct {
	metadata *meta.Response   // last metadata response seen by the pool
	err      error            // last error from metadata requests
	layout   protocol.Cluster // cluster layout built from metadata response
}
  265. func (p *connPool) grabState() connPoolState {
  266. state, _ := p.state.Load().(connPoolState)
  267. return state
  268. }
// setState atomically replaces the pool's cached state with the given value.
func (p *connPool) setState(state connPoolState) {
	p.state.Store(state)
}
  272. func (p *connPool) ref() {
  273. atomic.AddUintptr(&p.refc, +1)
  274. }
  275. func (p *connPool) unref() {
  276. if atomic.AddUintptr(&p.refc, ^uintptr(0)) == 0 {
  277. p.mutex.Lock()
  278. defer p.mutex.Unlock()
  279. for _, conns := range p.conns {
  280. conns.closeIdleConns()
  281. }
  282. p.ctrl.closeIdleConns()
  283. p.cancel()
  284. }
  285. }
  286. func (p *connPool) roundTrip(ctx context.Context, req Request) (Response, error) {
  287. // This first select should never block after the first metadata response
  288. // that would mark the pool as `ready`.
  289. select {
  290. case <-p.ready:
  291. case <-ctx.Done():
  292. return nil, ctx.Err()
  293. }
  294. state := p.grabState()
  295. var response promise
  296. switch m := req.(type) {
  297. case *meta.Request:
  298. // We serve metadata requests directly from the transport cache.
  299. //
  300. // This reduces the number of round trips to kafka brokers while keeping
  301. // the logic simple when applying partitioning strategies.
  302. if state.err != nil {
  303. return nil, state.err
  304. }
  305. return filterMetadataResponse(m, state.metadata), nil
  306. case protocol.Splitter:
  307. // Messages that implement the Splitter interface trigger the creation of
  308. // multiple requests that are all merged back into a single results by
  309. // a merger.
  310. messages, merger, err := m.Split(state.layout)
  311. if err != nil {
  312. return nil, err
  313. }
  314. promises := make([]promise, len(messages))
  315. for i, m := range messages {
  316. promises[i] = p.sendRequest(ctx, m, state)
  317. }
  318. response = join(promises, messages, merger)
  319. }
  320. if response == nil {
  321. response = p.sendRequest(ctx, req, state)
  322. }
  323. r, err := response.await(ctx)
  324. if err != nil {
  325. return r, err
  326. }
  327. switch resp := r.(type) {
  328. case *createtopics.Response:
  329. // Force an update of the metadata when adding topics,
  330. // otherwise the cached state would get out of sync.
  331. topicsToRefresh := make([]string, 0, len(resp.Topics))
  332. for _, topic := range resp.Topics {
  333. // fixes issue 672: don't refresh topics that failed to create, it causes the library to hang indefinitely
  334. if topic.ErrorCode != 0 {
  335. continue
  336. }
  337. topicsToRefresh = append(topicsToRefresh, topic.Name)
  338. }
  339. p.refreshMetadata(ctx, topicsToRefresh)
  340. }
  341. return r, nil
  342. }
// refreshMetadata forces an update of the cached cluster metadata, and waits
// for the given list of topics to appear. This waiting mechanism is necessary
// to account for the fact that topic creation is asynchronous in kafka, and
// causes subsequent requests to fail while the cluster state is propagated to
// all the brokers.
//
// The method returns when every expected topic is present in the cached
// layout, or when ctx is done, whichever happens first.
func (p *connPool) refreshMetadata(ctx context.Context, expectTopics []string) {
	minBackoff := 100 * time.Millisecond
	maxBackoff := 2 * time.Second
	cancel := ctx.Done()

	for ctx.Err() == nil {
		// Hand a fresh event to the discover goroutine through the wake
		// channel, then wait for it to be triggered, which happens after the
		// next metadata update has been applied.
		notify := make(event)
		select {
		case <-cancel:
			return
		case p.wake <- notify:
			select {
			case <-notify:
			case <-cancel:
				return
			}
		}

		// Count how many of the expected topics are in the refreshed layout.
		state := p.grabState()
		found := 0
		for _, topic := range expectTopics {
			if _, ok := state.layout.Topics[topic]; ok {
				found++
			}
		}

		if found == len(expectTopics) {
			return
		}

		// NOTE(review): the random delay is only compared against zero, the
		// timer itself always waits for the full minBackoff. This looks like
		// the jitter was meant to be the wait duration -- confirm the intent
		// before changing the timing.
		if delay := time.Duration(rand.Int63n(int64(minBackoff))); delay > 0 {
			timer := time.NewTimer(minBackoff)
			select {
			case <-cancel:
			case <-timer.C:
			}
			timer.Stop()

			// Double the backoff for the next attempt, capped at maxBackoff.
			if minBackoff *= 2; minBackoff > maxBackoff {
				minBackoff = maxBackoff
			}
		}
	}
}
// setReady triggers the pool's `ready` event; the sync.Once guarantees that
// the event is triggered a single time no matter how often this is called.
func (p *connPool) setReady() {
	p.once.Do(p.ready.trigger)
}
// update is called periodically by the goroutine running the discover method
// to refresh the cluster layout information used by the transport to route
// requests to brokers.
//
// metadata is the latest metadata response (may be nil) and err the error of
// the metadata request (may be nil). On success the cached state is replaced
// and the broker connection groups are reconciled with the new layout. On
// error, the error is only recorded when no metadata was cached before.
func (p *connPool) update(ctx context.Context, metadata *meta.Response, err error) {
	var layout protocol.Cluster

	if metadata != nil {
		metadata.ThrottleTimeMs = 0

		// Normalize the lists so we can apply binary search on them.
		sortMetadataBrokers(metadata.Brokers)
		sortMetadataTopics(metadata.Topics)

		for i := range metadata.Topics {
			t := &metadata.Topics[i]
			sortMetadataPartitions(t.Partitions)
		}

		layout = makeLayout(metadata)
	}

	state := p.grabState()
	addBrokers := make(map[int32]struct{})
	delBrokers := make(map[int32]struct{})

	if err != nil {
		// Only update the error on the transport if the cluster layout was
		// unknown. This ensures that we prioritize a previously known state
		// of the cluster to reduce the impact of transient failures.
		if state.metadata != nil {
			return
		}
		state.err = err
	} else {
		// Brokers that are new, or whose description changed, need a new
		// connection group; changed ones also need their old group removed.
		for id, b2 := range layout.Brokers {
			if b1, ok := state.layout.Brokers[id]; !ok {
				addBrokers[id] = struct{}{}
			} else if b1 != b2 {
				addBrokers[id] = struct{}{}
				delBrokers[id] = struct{}{}
			}
		}

		// Brokers that disappeared from the layout are removed.
		for id := range state.layout.Brokers {
			if _, ok := layout.Brokers[id]; !ok {
				delBrokers[id] = struct{}{}
			}
		}

		state.metadata, state.layout = metadata, layout
		state.err = nil
	}

	// Deferred calls run in reverse order: the new state is stored first, and
	// only then is the pool marked as ready.
	defer p.setReady()
	defer p.setState(state)

	if len(addBrokers) != 0 || len(delBrokers) != 0 {
		// Only acquire the lock when there is a change of layout. This is an
		// infrequent event so we don't risk introducing regular contention on
		// the mutex if we were to lock it on every update.
		p.mutex.Lock()
		defer p.mutex.Unlock()

		if ctx.Err() != nil {
			return // the pool has been closed, no need to update
		}

		for id := range delBrokers {
			if broker := p.conns[id]; broker != nil {
				broker.closeIdleConns()
				delete(p.conns, id)
			}
		}

		for id := range addBrokers {
			broker := layout.Brokers[id]
			p.conns[id] = p.newBrokerConnGroup(Broker{
				Rack: broker.Rack,
				Host: broker.Host,
				Port: int(broker.Port),
				ID:   int(broker.ID),
			})
		}
	}
}
// discover is the entry point of an internal goroutine for the transport which
// periodically requests updates of the cluster metadata and refreshes the
// transport cached cluster layout.
//
// The loop runs until ctx is done. Each iteration fetches the metadata over
// the control connection, applies it with update, triggers the event received
// from wake (if any), then waits for the randomized TTL timer, a cancellation,
// or a new event on wake.
func (p *connPool) discover(ctx context.Context, wake <-chan event) {
	prng := rand.New(rand.NewSource(time.Now().UnixNano()))

	// metadataTTL returns a random duration in [0, p.metadataTTL), so the
	// configured TTL acts as an upper bound.
	metadataTTL := func() time.Duration {
		return time.Duration(prng.Int63n(int64(p.metadataTTL)))
	}

	timer := time.NewTimer(metadataTTL())
	defer timer.Stop()

	// notify holds the event received from wake that must be triggered once
	// the next metadata update has been applied.
	var notify event
	done := ctx.Done()

	for {
		c, err := p.grabClusterConn(ctx)
		if err != nil {
			p.update(ctx, nil, err)
		} else {
			res := make(async, 1)
			req := &meta.Request{
				IncludeClusterAuthorizedOperations: true,
				IncludeTopicAuthorizedOperations:   true,
			}

			// The metadata request itself is bounded by the metadata TTL.
			deadline, cancel := context.WithTimeout(ctx, p.metadataTTL)
			c.reqs <- connRequest{
				ctx: deadline,
				req: req,
				res: res,
			}
			r, err := res.await(deadline)
			cancel()
			// Stop the goroutine only when the pool's own context is done; a
			// timeout of the request deadline is reported through update.
			if err != nil && err == ctx.Err() {
				return
			}

			ret, _ := r.(*meta.Response)
			p.update(ctx, ret, err)
		}

		if notify != nil {
			notify.trigger()
			notify = nil
		}

		select {
		case <-timer.C:
			timer.Reset(metadataTTL())
		case <-done:
			return
		case notify = <-wake:
		}
	}
}
  511. // grabBrokerConn returns a connection to a specific broker represented by the
  512. // broker id passed as argument. If the broker id was not known, an error is
  513. // returned.
  514. func (p *connPool) grabBrokerConn(ctx context.Context, brokerID int32) (*conn, error) {
  515. p.mutex.RLock()
  516. g := p.conns[brokerID]
  517. p.mutex.RUnlock()
  518. if g == nil {
  519. return nil, BrokerNotAvailable
  520. }
  521. return g.grabConnOrConnect(ctx)
  522. }
// grabClusterConn returns the connection to the kafka cluster that the pool is
// configured to connect to.
//
// The transport uses a shared `control` connection to the cluster for any
// requests that aren't supposed to be sent to specific brokers. Requests
// intended to be routed to specific brokers (e.g. Fetch or Produce requests)
// are dispatched on a separate pool of connections that the transport
// maintains. This split helps avoid head-of-line blocking situations where
// control requests like Metadata would be queued behind large responses from
// Fetch requests for example.
//
// In either case, the requests are multiplexed so we can keep a minimal number
// of connections open (N+1, where N is the number of brokers in the cluster).
func (p *connPool) grabClusterConn(ctx context.Context) (*conn, error) {
	return p.ctrl.grabConnOrConnect(ctx)
}
  539. func (p *connPool) sendRequest(ctx context.Context, req Request, state connPoolState) promise {
  540. brokerID := int32(-1)
  541. switch m := req.(type) {
  542. case protocol.BrokerMessage:
  543. // Some requests are supposed to be sent to specific brokers (e.g. the
  544. // partition leaders). They implement the BrokerMessage interface to
  545. // delegate the routing decision to each message type.
  546. broker, err := m.Broker(state.layout)
  547. if err != nil {
  548. return reject(err)
  549. }
  550. brokerID = broker.ID
  551. case protocol.GroupMessage:
  552. // Some requests are supposed to be sent to a group coordinator,
  553. // look up which broker is currently the coordinator for the group
  554. // so we can get a connection to that broker.
  555. //
  556. // TODO: should we cache the coordinator info?
  557. p := p.sendRequest(ctx, &findcoordinator.Request{Key: m.Group()}, state)
  558. r, err := p.await(ctx)
  559. if err != nil {
  560. return reject(err)
  561. }
  562. brokerID = r.(*findcoordinator.Response).NodeID
  563. }
  564. var c *conn
  565. var err error
  566. if brokerID >= 0 {
  567. c, err = p.grabBrokerConn(ctx, brokerID)
  568. } else {
  569. c, err = p.grabClusterConn(ctx)
  570. }
  571. if err != nil {
  572. return reject(err)
  573. }
  574. res := make(async, 1)
  575. c.reqs <- connRequest{
  576. ctx: ctx,
  577. req: req,
  578. res: res,
  579. }
  580. return res
  581. }
  582. func filterMetadataResponse(req *meta.Request, res *meta.Response) *meta.Response {
  583. ret := *res
  584. if req.TopicNames != nil {
  585. ret.Topics = make([]meta.ResponseTopic, len(req.TopicNames))
  586. for i, topicName := range req.TopicNames {
  587. j, ok := findMetadataTopic(res.Topics, topicName)
  588. if ok {
  589. ret.Topics[i] = res.Topics[j]
  590. } else {
  591. ret.Topics[i] = meta.ResponseTopic{
  592. ErrorCode: int16(UnknownTopicOrPartition),
  593. Name: topicName,
  594. }
  595. }
  596. }
  597. }
  598. return &ret
  599. }
  600. func findMetadataTopic(topics []meta.ResponseTopic, topicName string) (int, bool) {
  601. i := sort.Search(len(topics), func(i int) bool {
  602. return topics[i].Name >= topicName
  603. })
  604. return i, i >= 0 && i < len(topics) && topics[i].Name == topicName
  605. }
  606. func sortMetadataBrokers(brokers []meta.ResponseBroker) {
  607. sort.Slice(brokers, func(i, j int) bool {
  608. return brokers[i].NodeID < brokers[j].NodeID
  609. })
  610. }
  611. func sortMetadataTopics(topics []meta.ResponseTopic) {
  612. sort.Slice(topics, func(i, j int) bool {
  613. return topics[i].Name < topics[j].Name
  614. })
  615. }
  616. func sortMetadataPartitions(partitions []meta.ResponsePartition) {
  617. sort.Slice(partitions, func(i, j int) bool {
  618. return partitions[i].PartitionIndex < partitions[j].PartitionIndex
  619. })
  620. }
  621. func makeLayout(metadataResponse *meta.Response) protocol.Cluster {
  622. layout := protocol.Cluster{
  623. Controller: metadataResponse.ControllerID,
  624. Brokers: make(map[int32]protocol.Broker),
  625. Topics: make(map[string]protocol.Topic),
  626. }
  627. for _, broker := range metadataResponse.Brokers {
  628. layout.Brokers[broker.NodeID] = protocol.Broker{
  629. Rack: broker.Rack,
  630. Host: broker.Host,
  631. Port: broker.Port,
  632. ID: broker.NodeID,
  633. }
  634. }
  635. for _, topic := range metadataResponse.Topics {
  636. if topic.IsInternal {
  637. continue // TODO: do we need to expose those?
  638. }
  639. layout.Topics[topic.Name] = protocol.Topic{
  640. Name: topic.Name,
  641. Error: topic.ErrorCode,
  642. Partitions: makePartitions(topic.Partitions),
  643. }
  644. }
  645. return layout
  646. }
  647. func makePartitions(metadataPartitions []meta.ResponsePartition) map[int32]protocol.Partition {
  648. protocolPartitions := make(map[int32]protocol.Partition, len(metadataPartitions))
  649. numBrokerIDs := 0
  650. for _, p := range metadataPartitions {
  651. numBrokerIDs += len(p.ReplicaNodes) + len(p.IsrNodes) + len(p.OfflineReplicas)
  652. }
  653. // Reduce the memory footprint a bit by allocating a single buffer to write
  654. // all broker ids.
  655. brokerIDs := make([]int32, 0, numBrokerIDs)
  656. for _, p := range metadataPartitions {
  657. var rep, isr, off []int32
  658. brokerIDs, rep = appendBrokerIDs(brokerIDs, p.ReplicaNodes)
  659. brokerIDs, isr = appendBrokerIDs(brokerIDs, p.IsrNodes)
  660. brokerIDs, off = appendBrokerIDs(brokerIDs, p.OfflineReplicas)
  661. protocolPartitions[p.PartitionIndex] = protocol.Partition{
  662. ID: p.PartitionIndex,
  663. Error: p.ErrorCode,
  664. Leader: p.LeaderID,
  665. Replicas: rep,
  666. ISR: isr,
  667. Offline: off,
  668. }
  669. }
  670. return protocolPartitions
  671. }
  672. func appendBrokerIDs(ids, brokers []int32) ([]int32, []int32) {
  673. i := len(ids)
  674. ids = append(ids, brokers...)
  675. return ids, ids[i:len(ids):len(ids)]
  676. }
  677. func (p *connPool) newConnGroup(a net.Addr) *connGroup {
  678. return &connGroup{
  679. addr: a,
  680. pool: p,
  681. broker: Broker{
  682. ID: -1,
  683. },
  684. }
  685. }
  686. func (p *connPool) newBrokerConnGroup(broker Broker) *connGroup {
  687. return &connGroup{
  688. addr: &networkAddress{
  689. network: "tcp",
  690. address: net.JoinHostPort(broker.Host, strconv.Itoa(broker.Port)),
  691. },
  692. pool: p,
  693. broker: broker,
  694. }
  695. }
// connRequest is the unit of work submitted to a connection through its
// request channel.
type connRequest struct {
	ctx context.Context // context of the request
	req Request         // message to send
	res async           // promise on which the outcome is delivered
}
// The promise interface is used as a message passing abstraction to coordinate
// between goroutines that handle requests and responses.
type promise interface {
	// Waits until the promise is resolved, rejected, or the context canceled.
	//
	// Returns the response the promise was resolved with, or a non-nil error
	// when it was rejected or the context was done first.
	await(context.Context) (Response, error)
}
  707. // async is an implementation of the promise interface which supports resolving
  708. // or rejecting the await call asynchronously.
  709. type async chan interface{}
  710. func (p async) await(ctx context.Context) (Response, error) {
  711. select {
  712. case x := <-p:
  713. switch v := x.(type) {
  714. case nil:
  715. return nil, nil // A nil response is ok (e.g. when RequiredAcks is None)
  716. case Response:
  717. return v, nil
  718. case error:
  719. return nil, v
  720. default:
  721. panic(fmt.Errorf("BUG: promise resolved with impossible value of type %T", v))
  722. }
  723. case <-ctx.Done():
  724. return nil, ctx.Err()
  725. }
  726. }
  727. func (p async) resolve(res Response) { p <- res }
  728. func (p async) reject(err error) { p <- err }
  729. // rejected is an implementation of the promise interface which is always
  730. // returns an error. Values of this type are constructed using the reject
  731. // function.
  732. type rejected struct{ err error }
  733. func reject(err error) promise { return &rejected{err: err} }
  734. func (p *rejected) await(ctx context.Context) (Response, error) {
  735. return nil, p.err
  736. }
  737. // joined is an implementation of the promise interface which merges results
  738. // from multiple promises into one await call using a merger.
  739. type joined struct {
  740. promises []promise
  741. requests []Request
  742. merger protocol.Merger
  743. }
  744. func join(promises []promise, requests []Request, merger protocol.Merger) promise {
  745. return &joined{
  746. promises: promises,
  747. requests: requests,
  748. merger: merger,
  749. }
  750. }
  751. func (p *joined) await(ctx context.Context) (Response, error) {
  752. results := make([]interface{}, len(p.promises))
  753. for i, sub := range p.promises {
  754. m, err := sub.await(ctx)
  755. if err != nil {
  756. results[i] = err
  757. } else {
  758. results[i] = m
  759. }
  760. }
  761. return p.merger.Merge(p.requests, results)
  762. }
  // Default dialer used by the transport connections when no Dial function
  // was configured by the program.
  //
  // The deprecated net.Dialer.DualStack field is intentionally not set: fast
  // fallback between IPv4 and IPv6 is enabled by default and is only disabled
  // by a negative FallbackDelay, so leaving the field out keeps the same
  // dialing behavior without tripping deprecation linters.
  var defaultDialer = net.Dialer{
  	Timeout: 3 * time.Second,
  }
  769. // connGroup represents a logical connection group to a kafka broker. The
  770. // actual network connections are lazily open before sending requests, and
  771. // closed if they are unused for longer than the idle timeout.
  772. type connGroup struct {
  773. addr net.Addr
  774. broker Broker
  775. // Immutable state of the connection.
  776. pool *connPool
  777. // Shared state of the connection, this is synchronized on the mutex through
  778. // calls to the synchronized method. Both goroutines of the connection share
  779. // the state maintained in these fields.
  780. mutex sync.Mutex
  781. closed bool
  782. idleConns []*conn // stack of idle connections
  783. }
  784. func (g *connGroup) closeIdleConns() {
  785. g.mutex.Lock()
  786. conns := g.idleConns
  787. g.idleConns = nil
  788. g.closed = true
  789. g.mutex.Unlock()
  790. for _, c := range conns {
  791. c.close()
  792. }
  793. }
  794. func (g *connGroup) grabConnOrConnect(ctx context.Context) (*conn, error) {
  795. rslv := g.pool.resolver
  796. addr := g.addr
  797. var c *conn
  798. if rslv == nil {
  799. c = g.grabConn()
  800. } else {
  801. var err error
  802. broker := g.broker
  803. if broker.ID < 0 {
  804. host, port, err := net.SplitHostPort(addr.String())
  805. if err != nil {
  806. return nil, fmt.Errorf("%s: %w", addr, err)
  807. }
  808. portNumber, err := strconv.Atoi(port)
  809. if err != nil {
  810. return nil, fmt.Errorf("%s: %w", addr, err)
  811. }
  812. broker.Host = host
  813. broker.Port = portNumber
  814. }
  815. ipAddrs, err := rslv.LookupBrokerIPAddr(ctx, broker)
  816. if err != nil {
  817. return nil, err
  818. }
  819. for _, ipAddr := range ipAddrs {
  820. network := addr.Network()
  821. address := net.JoinHostPort(ipAddr.String(), strconv.Itoa(broker.Port))
  822. if c = g.grabConnTo(network, address); c != nil {
  823. break
  824. }
  825. }
  826. }
  827. if c == nil {
  828. connChan := make(chan *conn)
  829. errChan := make(chan error)
  830. go func() {
  831. c, err := g.connect(ctx, addr)
  832. if err != nil {
  833. select {
  834. case errChan <- err:
  835. case <-ctx.Done():
  836. }
  837. } else {
  838. select {
  839. case connChan <- c:
  840. case <-ctx.Done():
  841. if !g.releaseConn(c) {
  842. c.close()
  843. }
  844. }
  845. }
  846. }()
  847. select {
  848. case c = <-connChan:
  849. case err := <-errChan:
  850. return nil, err
  851. case <-ctx.Done():
  852. return nil, ctx.Err()
  853. }
  854. }
  855. return c, nil
  856. }
  857. func (g *connGroup) grabConnTo(network, address string) *conn {
  858. g.mutex.Lock()
  859. defer g.mutex.Unlock()
  860. for i := len(g.idleConns) - 1; i >= 0; i-- {
  861. c := g.idleConns[i]
  862. if c.network == network && c.address == address {
  863. copy(g.idleConns[i:], g.idleConns[i+1:])
  864. n := len(g.idleConns) - 1
  865. g.idleConns[n] = nil
  866. g.idleConns = g.idleConns[:n]
  867. if c.timer != nil {
  868. c.timer.Stop()
  869. }
  870. return c
  871. }
  872. }
  873. return nil
  874. }
  875. func (g *connGroup) grabConn() *conn {
  876. g.mutex.Lock()
  877. defer g.mutex.Unlock()
  878. if len(g.idleConns) == 0 {
  879. return nil
  880. }
  881. n := len(g.idleConns) - 1
  882. c := g.idleConns[n]
  883. g.idleConns[n] = nil
  884. g.idleConns = g.idleConns[:n]
  885. if c.timer != nil {
  886. c.timer.Stop()
  887. }
  888. return c
  889. }
  890. func (g *connGroup) removeConn(c *conn) bool {
  891. g.mutex.Lock()
  892. defer g.mutex.Unlock()
  893. if c.timer != nil {
  894. c.timer.Stop()
  895. }
  896. for i, x := range g.idleConns {
  897. if x == c {
  898. copy(g.idleConns[i:], g.idleConns[i+1:])
  899. n := len(g.idleConns) - 1
  900. g.idleConns[n] = nil
  901. g.idleConns = g.idleConns[:n]
  902. return true
  903. }
  904. }
  905. return false
  906. }
  907. func (g *connGroup) releaseConn(c *conn) bool {
  908. idleTimeout := g.pool.idleTimeout
  909. g.mutex.Lock()
  910. defer g.mutex.Unlock()
  911. if g.closed {
  912. return false
  913. }
  914. if c.timer != nil {
  915. c.timer.Reset(idleTimeout)
  916. } else {
  917. c.timer = time.AfterFunc(idleTimeout, func() {
  918. if g.removeConn(c) {
  919. c.close()
  920. }
  921. })
  922. }
  923. g.idleConns = append(g.idleConns, c)
  924. return true
  925. }
  926. func (g *connGroup) connect(ctx context.Context, addr net.Addr) (*conn, error) {
  927. deadline := time.Now().Add(g.pool.dialTimeout)
  928. ctx, cancel := context.WithDeadline(ctx, deadline)
  929. defer cancel()
  930. network := strings.Split(addr.Network(), ",")
  931. address := strings.Split(addr.String(), ",")
  932. var netConn net.Conn
  933. var netAddr net.Addr
  934. var err error
  935. if len(address) > 1 {
  936. // Shuffle the list of addresses to randomize the order in which
  937. // connections are attempted. This prevents routing all connections
  938. // to the first broker (which will usually succeed).
  939. rand.Shuffle(len(address), func(i, j int) {
  940. network[i], network[j] = network[j], network[i]
  941. address[i], address[j] = address[j], address[i]
  942. })
  943. }
  944. for i := range address {
  945. netConn, err = g.pool.dial(ctx, network[i], address[i])
  946. if err == nil {
  947. netAddr = &networkAddress{
  948. network: network[i],
  949. address: address[i],
  950. }
  951. break
  952. }
  953. }
  954. if err != nil {
  955. return nil, err
  956. }
  957. defer func() {
  958. if netConn != nil {
  959. netConn.Close()
  960. }
  961. }()
  962. if tlsConfig := g.pool.tls; tlsConfig != nil {
  963. if tlsConfig.ServerName == "" && !tlsConfig.InsecureSkipVerify {
  964. host, _, _ := net.SplitHostPort(netAddr.String())
  965. tlsConfig = tlsConfig.Clone()
  966. tlsConfig.ServerName = host
  967. }
  968. netConn = tls.Client(netConn, tlsConfig)
  969. }
  970. pc := protocol.NewConn(netConn, g.pool.clientID)
  971. pc.SetDeadline(deadline)
  972. r, err := pc.RoundTrip(new(apiversions.Request))
  973. if err != nil {
  974. return nil, err
  975. }
  976. res := r.(*apiversions.Response)
  977. ver := make(map[protocol.ApiKey]int16, len(res.ApiKeys))
  978. if res.ErrorCode != 0 {
  979. return nil, fmt.Errorf("negotating API versions with kafka broker at %s: %w", g.addr, Error(res.ErrorCode))
  980. }
  981. for _, r := range res.ApiKeys {
  982. apiKey := protocol.ApiKey(r.ApiKey)
  983. ver[apiKey] = apiKey.SelectVersion(r.MinVersion, r.MaxVersion)
  984. }
  985. pc.SetVersions(ver)
  986. pc.SetDeadline(time.Time{})
  987. if g.pool.sasl != nil {
  988. if err := authenticateSASL(ctx, pc, g.pool.sasl); err != nil {
  989. return nil, err
  990. }
  991. }
  992. reqs := make(chan connRequest)
  993. c := &conn{
  994. network: netAddr.Network(),
  995. address: netAddr.String(),
  996. reqs: reqs,
  997. group: g,
  998. }
  999. go c.run(pc, reqs)
  1000. netConn = nil
  1001. return c, nil
  1002. }
  1003. type conn struct {
  1004. reqs chan<- connRequest
  1005. network string
  1006. address string
  1007. once sync.Once
  1008. group *connGroup
  1009. timer *time.Timer
  1010. }
  1011. func (c *conn) close() {
  1012. c.once.Do(func() { close(c.reqs) })
  1013. }
  1014. func (c *conn) run(pc *protocol.Conn, reqs <-chan connRequest) {
  1015. defer pc.Close()
  1016. for cr := range reqs {
  1017. r, err := c.roundTrip(cr.ctx, pc, cr.req)
  1018. if err != nil {
  1019. cr.res.reject(err)
  1020. if !errors.Is(err, protocol.ErrNoRecord) {
  1021. break
  1022. }
  1023. } else {
  1024. cr.res.resolve(r)
  1025. }
  1026. if !c.group.releaseConn(c) {
  1027. break
  1028. }
  1029. }
  1030. }
  1031. func (c *conn) roundTrip(ctx context.Context, pc *protocol.Conn, req Request) (Response, error) {
  1032. pprof.SetGoroutineLabels(ctx)
  1033. defer pprof.SetGoroutineLabels(context.Background())
  1034. if deadline, hasDeadline := ctx.Deadline(); hasDeadline {
  1035. pc.SetDeadline(deadline)
  1036. defer pc.SetDeadline(time.Time{})
  1037. }
  1038. return pc.RoundTrip(req)
  1039. }
  1040. // authenticateSASL performs all of the required requests to authenticate this
  1041. // connection. If any step fails, this function returns with an error. A nil
  1042. // error indicates successful authentication.
  1043. func authenticateSASL(ctx context.Context, pc *protocol.Conn, mechanism sasl.Mechanism) error {
  1044. if err := saslHandshakeRoundTrip(pc, mechanism.Name()); err != nil {
  1045. return err
  1046. }
  1047. sess, state, err := mechanism.Start(ctx)
  1048. if err != nil {
  1049. return err
  1050. }
  1051. for completed := false; !completed; {
  1052. challenge, err := saslAuthenticateRoundTrip(pc, state)
  1053. switch err {
  1054. case nil:
  1055. case io.EOF:
  1056. // the broker may communicate a failed exchange by closing the
  1057. // connection (esp. in the case where we're passing opaque sasl
  1058. // data over the wire since there's no protocol info).
  1059. return SASLAuthenticationFailed
  1060. default:
  1061. return err
  1062. }
  1063. completed, state, err = sess.Next(ctx, challenge)
  1064. if err != nil {
  1065. return err
  1066. }
  1067. }
  1068. return nil
  1069. }
  1070. // saslHandshake sends the SASL handshake message. This will determine whether
  1071. // the Mechanism is supported by the cluster. If it's not, this function will
  1072. // error out with UnsupportedSASLMechanism.
  1073. //
  1074. // If the mechanism is unsupported, the handshake request will reply with the
  1075. // list of the cluster's configured mechanisms, which could potentially be used
  1076. // to facilitate negotiation. At the moment, we are not negotiating the
  1077. // mechanism as we believe that brokers are usually known to the client, and
  1078. // therefore the client should already know which mechanisms are supported.
  1079. //
  1080. // See http://kafka.apache.org/protocol.html#The_Messages_SaslHandshake
  1081. func saslHandshakeRoundTrip(pc *protocol.Conn, mechanism string) error {
  1082. msg, err := pc.RoundTrip(&saslhandshake.Request{
  1083. Mechanism: mechanism,
  1084. })
  1085. if err != nil {
  1086. return err
  1087. }
  1088. res := msg.(*saslhandshake.Response)
  1089. if res.ErrorCode != 0 {
  1090. err = Error(res.ErrorCode)
  1091. }
  1092. return err
  1093. }
  1094. // saslAuthenticate sends the SASL authenticate message. This function must
  1095. // be immediately preceded by a successful saslHandshake.
  1096. //
  1097. // See http://kafka.apache.org/protocol.html#The_Messages_SaslAuthenticate
  1098. func saslAuthenticateRoundTrip(pc *protocol.Conn, data []byte) ([]byte, error) {
  1099. msg, err := pc.RoundTrip(&saslauthenticate.Request{
  1100. AuthBytes: data,
  1101. })
  1102. if err != nil {
  1103. return nil, err
  1104. }
  1105. res := msg.(*saslauthenticate.Response)
  1106. if res.ErrorCode != 0 {
  1107. err = makeError(res.ErrorCode, res.ErrorMessage)
  1108. }
  1109. return res.AuthBytes, err
  1110. }
  // Compile-time assertion that *Transport satisfies the RoundTripper
  // interface.
  var _ RoundTripper = (*Transport)(nil)